Diffstat (limited to 'arch/x86')
269 files changed, 12141 insertions, 26451 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 121f9f03bd5c..08e511657f05 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -38,6 +38,7 @@ config X86_64 select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 select EXECMEM if DYNAMIC_FTRACE + select ACPI_MRRM if ACPI config FORCE_DYNAMIC_FTRACE def_bool y @@ -74,13 +75,11 @@ config X86 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_CPU_ATTACK_VECTORS if CPU_MITIGATIONS select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_CPU_PASID if IOMMU_SVA - select ARCH_HAS_CRC32 - select ARCH_HAS_CRC64 if X86_64 - select ARCH_HAS_CRC_T10DIF select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE @@ -88,7 +87,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE - select ARCH_HAS_EXECMEM_ROX if X86_64 + select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -146,7 +145,7 @@ config X86 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_NO_INSTR select ARCH_WANT_GENERAL_HUGETLB - select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGE_PMD_SHARE if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64 select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 @@ -203,13 +202,13 @@ config X86 select HAVE_ARCH_KFENCE select HAVE_ARCH_KMSAN if X86_64 select HAVE_ARCH_KGDB + select HAVE_ARCH_KSTACK_ERASE select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_THREAD_STRUCT_WHITELIST - select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 @@ -243,7 +242,6 @@ config X86 select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_FREGS if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE) select HAVE_FUNCTION_TRACER @@ -507,8 +505,9 @@ config X86_MPPARSE config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) - select KERNFS - select PROC_CPU_RESCTRL if PROC_FS + depends on MISC_FILESYSTEMS + select ARCH_HAS_CPU_RESCTRL + select RESCTRL_FS select RESCTRL_FS_PSEUDO_LOCK help Enable x86 CPU resource control support. @@ -526,12 +525,6 @@ config X86_CPU_RESCTRL Say N if unsure. -config RESCTRL_FS_PSEUDO_LOCK - bool - help - Software mechanism to pin data in a cache portion using - micro-architecture specific knowledge. 
- config X86_FRED bool "Flexible Return and Event Delivery" depends on X86_64 @@ -1862,8 +1855,7 @@ endchoice config X86_SGX bool "Software Guard eXtensions (SGX)" depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC - depends on CRYPTO=y - depends on CRYPTO_SHA256=y + select CRYPTO_LIB_SHA256 select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA select XARRAY_MULTI @@ -2010,6 +2002,9 @@ config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG config ARCH_SUPPORTS_KEXEC_JUMP def_bool y +config ARCH_SUPPORTS_KEXEC_HANDOVER + def_bool X86_64 + config ARCH_SUPPORTS_CRASH_DUMP def_bool X86_64 || (X86_32 && HIGHMEM) @@ -2697,6 +2692,15 @@ config MITIGATION_ITS disabled, mitigation cannot be enabled via cmdline. See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst> +config MITIGATION_TSA + bool "Mitigate Transient Scheduler Attacks" + depends on CPU_SUP_AMD + default y + help + Enable mitigation for Transient Scheduler Attacks. TSA is a hardware + security vulnerability on AMD CPUs which can lead to forwarding of + invalid info to subsequent instructions and thus can affect their + timing and thereby cause a leakage. endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 640fcac3af74..3f9fb3698d66 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f4f7b22d8113..3a38fdcdb9bd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -106,6 +106,11 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a +vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o + +ifdef CONFIG_EFI_SBAT +$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE) +endif $(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f03d59ea6e40..3b0948ad449f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -760,6 +760,49 @@ static void process_e820_entries(unsigned long minimum, } } +/* + * If KHO is active, only process its scratch areas to ensure we are not + * stepping onto preserved memory. 
+ */ +static bool process_kho_entries(unsigned long minimum, unsigned long image_size) +{ + struct kho_scratch *kho_scratch; + struct setup_data *ptr; + struct kho_data *kho; + int i, nr_areas = 0; + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + return false; + + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; + while (ptr) { + if (ptr->type == SETUP_KEXEC_KHO) { + kho = (struct kho_data *)(unsigned long)ptr->data; + kho_scratch = (void *)(unsigned long)kho->scratch_addr; + nr_areas = kho->scratch_size / sizeof(*kho_scratch); + break; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + if (!nr_areas) + return false; + + for (i = 0; i < nr_areas; i++) { + struct kho_scratch *area = &kho_scratch[i]; + struct mem_vector region = { + .start = area->addr, + .size = area->size, + }; + + if (process_mem_region(®ion, minimum, image_size)) + break; + } + + return true; +} + static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { @@ -775,7 +818,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum, return 0; } - if (!process_efi_entries(minimum, image_size)) + /* + * During kexec handover only process KHO scratch areas that are known + * not to contain any data that must be preserved. + */ + if (!process_kho_entries(minimum, image_size) && + !process_efi_entries(minimum, image_size)) process_e820_entries(minimum, image_size); phys_addr = slots_fetch_random(); diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S new file mode 100644 index 000000000000..838f70a997dd --- /dev/null +++ b/arch/x86/boot/compressed/sbat.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Embed SBAT data in the kernel. + */ + .pushsection ".sbat", "a", @progbits + .incbin CONFIG_EFI_SBAT_FILE + .popsection diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 3b2bc61c9408..587ce3e7c504 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -43,6 +43,14 @@ SECTIONS *(.rodata.*) _erodata = . ; } +#ifdef CONFIG_EFI_SBAT + .sbat : ALIGN(0x1000) { + _sbat = . ; + *(.sbat) + _esbat = ALIGN(0x1000); + . = _esbat; + } +#endif .data : ALIGN(0x1000) { _data = . ; *(.data) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index e30649e44d8f..9bea5a1e2c52 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -43,7 +43,7 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ .section ".bstext", "ax" #ifdef CONFIG_EFI_STUB # "MZ", MS-DOS header - .word MZ_MAGIC + .word IMAGE_DOS_SIGNATURE .org 0x38 # # Offset to the PE header. 
@@ -51,16 +51,16 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ .long LINUX_PE_MAGIC .long pe_header pe_header: - .long PE_MAGIC + .long IMAGE_NT_SIGNATURE coff_header: #ifdef CONFIG_X86_32 .set image_file_add_flags, IMAGE_FILE_32BIT_MACHINE - .set pe_opt_magic, PE_OPT_MAGIC_PE32 + .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR32_MAGIC .word IMAGE_FILE_MACHINE_I386 #else .set image_file_add_flags, 0 - .set pe_opt_magic, PE_OPT_MAGIC_PE32PLUS + .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR64_MAGIC .word IMAGE_FILE_MACHINE_AMD64 #endif .word section_count # nr_sections @@ -111,7 +111,7 @@ extra_header_fields: .long salign # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) - .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics + .word IMAGE_DLLCHARACTERISTICS_NX_COMPAT # DllCharacteristics #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit @@ -179,15 +179,11 @@ pecompat_fstart: #else .set pecompat_fstart, setup_size #endif - .ascii ".text" - .byte 0 - .byte 0 - .byte 0 - .long ZO__data - .long setup_size - .long ZO__data # Size of initialized data - # on disk - .long setup_size + .ascii ".text\0\0\0" + .long textsize # VirtualSize + .long setup_size # VirtualAddress + .long textsize # SizeOfRawData + .long setup_size # PointerToRawData .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations @@ -196,6 +192,23 @@ pecompat_fstart: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_EXECUTE # Characteristics +#ifdef CONFIG_EFI_SBAT + .ascii ".sbat\0\0\0" + .long ZO__esbat - ZO__sbat # VirtualSize + .long setup_size + ZO__sbat # VirtualAddress + .long ZO__esbat - ZO__sbat # SizeOfRawData + .long setup_size + ZO__sbat # PointerToRawData + + .long 0, 0, 0 + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_DISCARDABLE # Characteristics + + .set textsize, ZO__sbat +#else + .set textsize, ZO__data +#endif + .ascii ".data\0\0\0" .long ZO__end - ZO__data # VirtualSize .long setup_size + ZO__data # VirtualAddress diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index db3255b979bd..342d79f0ab6a 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -5,5 +5,6 @@ obj-y += core.o sev-nmi.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining UBSAN_SANITIZE_sev-nmi.o := n -# GCC may fail to respect __no_sanitize_address when inlining +# GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining KASAN_SANITIZE_sev-nmi.o := n +KCSAN_SANITIZE_sev-nmi.o := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b2569257acd3..fc59ce78c477 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -88,7 +88,7 @@ static const char * const sev_status_feat_names[] = { */ static u64 snp_tsc_scale __ro_after_init; static u64 snp_tsc_offset __ro_after_init; -static u64 snp_tsc_freq_khz __ro_after_init; +static unsigned long snp_tsc_freq_khz __ro_after_init; DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); @@ -869,12 +869,12 @@ static void *snp_alloc_vmsa_page(int cpu) return page_address(p + 1); } -static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) +static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned int cpu) { struct sev_es_save_area *cur_vmsa, *vmsa; struct svsm_ca *caa; u8 sipi_vector; - int cpu, ret; + int ret; u64 cr4; /* @@ -895,15 +895,6 @@ static 
int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) /* Override start_ip with known protected guest start IP */ start_ip = real_mode_header->sev_es_trampoline_start; - - /* Find the logical CPU for the APIC ID */ - for_each_present_cpu(cpu) { - if (arch_match_cpu_phys_id(cpu, apic_id)) - break; - } - if (cpu >= nr_cpu_ids) - return -EINVAL; - cur_vmsa = per_cpu(sev_vmsa, cpu); /* @@ -1054,11 +1045,13 @@ int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) * This is needed by the OVMF UEFI firmware which will use whatever it finds in * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + * + * When running under SVSM the CA page is needed too, so map it as well. */ -int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) { + unsigned long address, pflags, pflags_enc; struct sev_es_runtime_data *data; - unsigned long address, pflags; int cpu; u64 pfn; @@ -1066,6 +1059,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) return 0; pflags = _PAGE_NX | _PAGE_RW; + pflags_enc = cc_mkenc(pflags); for_each_possible_cpu(cpu) { data = per_cpu(runtime_data, cpu); @@ -1075,6 +1069,16 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) return 1; + + if (snp_vmpl) { + address = per_cpu(svsm_caa_pa, cpu); + if (!address) + return 1; + + pfn = address >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags_enc)) + return 1; + } } return 0; @@ -1398,16 +1402,16 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, } EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req); -static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, - struct snp_guest_request_ioctl *rio) +static int snp_issue_guest_request(struct snp_guest_req *req) { + struct snp_req_data *input = &req->input; struct ghcb_state state; struct es_em_ctxt ctxt; unsigned long flags; struct ghcb *ghcb; int ret; - rio->exitinfo2 = SEV_RET_NO_FW_CALL; + req->exitinfo2 = SEV_RET_NO_FW_CALL; /* * __sev_get_ghcb() needs to run with IRQs disabled because it is using @@ -1432,8 +1436,8 @@ static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_dat if (ret) goto e_put; - rio->exitinfo2 = ghcb->save.sw_exit_info_2; - switch (rio->exitinfo2) { + req->exitinfo2 = ghcb->save.sw_exit_info_2; + switch (req->exitinfo2) { case 0: break; @@ -1462,11 +1466,74 @@ e_restore_irq: return ret; } +/** + * snp_svsm_vtpm_probe() - Probe if SVSM provides a vTPM device + * + * Check that there is SVSM and that it supports at least TPM_SEND_COMMAND + * which is the only request used so far. + * + * Return: true if the platform provides a vTPM SVSM device, false otherwise. + */ +static bool snp_svsm_vtpm_probe(void) +{ + struct svsm_call call = {}; + + /* The vTPM device is available only if a SVSM is present */ + if (!snp_vmpl) + return false; + + call.caa = svsm_get_caa(); + call.rax = SVSM_VTPM_CALL(SVSM_VTPM_QUERY); + + if (svsm_perform_call_protocol(&call)) + return false; + + /* Check platform commands contains TPM_SEND_COMMAND - platform command 8 */ + return call.rcx_out & BIT_ULL(8); +} + +/** + * snp_svsm_vtpm_send_command() - Execute a vTPM operation on SVSM + * @buffer: A buffer used to both send the command and receive the response. 
+ * + * Execute a SVSM_VTPM_CMD call as defined by + * "Secure VM Service Module for SEV-SNP Guests" Publication # 58019 Revision: 1.00 + * + * All command request/response buffers have a common structure as specified by + * the following table: + * Byte Size    In/Out    Description + * Offset    (Bytes) + * 0x000     4          In        Platform command + *                        Out       Platform command response size + * + * Each command can build upon this common request/response structure to create + * a structure specific to the command. See include/linux/tpm_svsm.h for more + * details. + * + * Return: 0 on success, -errno on failure + */ +int snp_svsm_vtpm_send_command(u8 *buffer) +{ + struct svsm_call call = {}; + + call.caa = svsm_get_caa(); + call.rax = SVSM_VTPM_CALL(SVSM_VTPM_CMD); + call.rcx = __pa(buffer); + + return svsm_perform_call_protocol(&call); +} +EXPORT_SYMBOL_GPL(snp_svsm_vtpm_send_command); + static struct platform_device sev_guest_device = { .name = "sev-guest", .id = -1, }; +static struct platform_device tpm_svsm_device = { + .name = "tpm-svsm", + .id = -1, +}; + static int __init snp_init_platform_device(void) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) @@ -1475,7 +1542,11 @@ static int __init snp_init_platform_device(void) if (platform_device_register(&sev_guest_device)) return -ENODEV; - pr_info("SNP guest platform device initialized.\n"); + if (snp_svsm_vtpm_probe() && + platform_device_register(&tpm_svsm_device)) + return -ENODEV; + + pr_info("SNP guest platform devices initialized.\n"); return 0; } device_initcall(snp_init_platform_device); @@ -1861,8 +1932,7 @@ static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_r return 0; } -static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) +static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req) { unsigned long req_start = jiffies; unsigned int override_npages = 0; @@ -1876,7 +1946,7 @@ retry_request: * sequence number must be incremented or the VMPCK must be deleted to * prevent reuse of the IV. */ - rc = snp_issue_guest_request(req, &req->input, rio); + rc = snp_issue_guest_request(req); switch (rc) { case -ENOSPC: /* @@ -1929,7 +1999,7 @@ retry_request: snp_inc_msg_seqno(mdesc); if (override_err) { - rio->exitinfo2 = override_err; + req->exitinfo2 = override_err; /* * If an extended guest request was issued and the supplied certificate @@ -1947,12 +2017,20 @@ retry_request: return rc; } -int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req) { u64 seqno; int rc; + /* + * enc_payload() calls aesgcm_encrypt(), which can potentially offload to HW. + * The offload's DMA SG list of data to encrypt has to be in linear mapping. + */ + if (!virt_addr_valid(req->req_buf) || !virt_addr_valid(req->resp_buf)) { + pr_warn("AES-GSM buffers must be in linear mapping"); + return -EINVAL; + } + guard(mutex)(&snp_cmd_mutex); /* Check if the VMPCK is not empty */ @@ -1985,14 +2063,14 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req req->input.resp_gpa = __pa(mdesc->response); req->input.data_gpa = req->certs_data ? 
__pa(req->certs_data) : 0; - rc = __handle_guest_request(mdesc, req, rio); + rc = __handle_guest_request(mdesc, req); if (rc) { if (rc == -EIO && - rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + req->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) return rc; pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n", - rc, rio->exitinfo2); + rc, req->exitinfo2); snp_disable_vmpck(mdesc); return rc; @@ -2011,11 +2089,10 @@ EXPORT_SYMBOL_GPL(snp_send_guest_request); static int __init snp_get_tsc_info(void) { - struct snp_guest_request_ioctl *rio; struct snp_tsc_info_resp *tsc_resp; struct snp_tsc_info_req *tsc_req; struct snp_msg_desc *mdesc; - struct snp_guest_req *req; + struct snp_guest_req req = {}; int rc = -ENOMEM; tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL); @@ -2031,32 +2108,24 @@ static int __init snp_get_tsc_info(void) if (!tsc_resp) goto e_free_tsc_req; - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - goto e_free_tsc_resp; - - rio = kzalloc(sizeof(*rio), GFP_KERNEL); - if (!rio) - goto e_free_req; - mdesc = snp_msg_alloc(); if (IS_ERR_OR_NULL(mdesc)) - goto e_free_rio; + goto e_free_tsc_resp; rc = snp_msg_init(mdesc, snp_vmpl); if (rc) goto e_free_mdesc; - req->msg_version = MSG_HDR_VER; - req->msg_type = SNP_MSG_TSC_INFO_REQ; - req->vmpck_id = snp_vmpl; - req->req_buf = tsc_req; - req->req_sz = sizeof(*tsc_req); - req->resp_buf = (void *)tsc_resp; - req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN; - req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + req.msg_version = MSG_HDR_VER; + req.msg_type = SNP_MSG_TSC_INFO_REQ; + req.vmpck_id = snp_vmpl; + req.req_buf = tsc_req; + req.req_sz = sizeof(*tsc_req); + req.resp_buf = (void *)tsc_resp; + req.resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN; + req.exit_code = SVM_VMGEXIT_GUEST_REQUEST; - rc = snp_send_guest_request(mdesc, req, rio); + rc = snp_send_guest_request(mdesc, &req); if (rc) goto e_request; @@ -2077,11 +2146,7 @@ e_request: memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN); e_free_mdesc: snp_msg_free(mdesc); -e_free_rio: - kfree(rio); -e_free_req: - kfree(req); - e_free_tsc_resp: +e_free_tsc_resp: kfree(tsc_resp); e_free_tsc_req: kfree(tsc_req); @@ -2109,15 +2174,31 @@ static unsigned long securetsc_get_tsc_khz(void) void __init snp_secure_tsc_init(void) { - unsigned long long tsc_freq_mhz; + struct snp_secrets_page *secrets; + unsigned long tsc_freq_mhz; + void *mem; if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) return; + mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC); + } + + secrets = (__force struct snp_secrets_page *)mem; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz); - snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000); + + /* Extract the GUEST TSC MHZ from BIT[17:0], rest is reserved space */ + tsc_freq_mhz &= GENMASK_ULL(17, 0); + + snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor); x86_platform.calibrate_cpu = securetsc_get_tsc_khz; x86_platform.calibrate_tsc = securetsc_get_tsc_khz; + + early_memunmap(mem, PAGE_SIZE); } diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index 0989d98da130..faf1fce89ed4 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -17,6 +17,7 @@ #include <linux/mm.h> #include <linux/io.h> #include <linux/psp-sev.h> +#include 
<linux/efi.h> #include <uapi/linux/sev-guest.h> #include <asm/init.h> @@ -178,9 +179,15 @@ static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) return ES_OK; } +/* + * User instruction decoding is also required for the EFI runtime. Even though + * the EFI runtime is running in kernel mode, it uses special EFI virtual + * address mappings that require the use of efi_mm to properly address and + * decode. + */ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) { - if (user_mode(ctxt->regs)) + if (user_mode(ctxt->regs) || mm_is_efi(current->active_mm)) return __vc_decode_user_insn(ctxt); else return __vc_decode_kern_insn(ctxt); diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index edab6d6049be..7b2833705d47 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -36,6 +36,7 @@ /* TDX Module call error codes */ #define TDCALL_RETURN_CODE(a) ((a) >> 32) #define TDCALL_INVALID_OPERAND 0xc0000100 +#define TDCALL_OPERAND_BUSY 0x80000200 #define TDREPORT_SUBTYPE_0 0 @@ -109,12 +110,13 @@ static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask) * REPORTDATA to be included into TDREPORT. * @tdreport: Address of the output buffer to store TDREPORT. * - * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module - * v1.0 specification for more information on TDG.MR.REPORT TDCALL. + * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0 + * specification for more information on TDG.MR.REPORT TDCALL. + * * It is used in the TDX guest driver module to get the TDREPORT0. * - * Return 0 on success, -EINVAL for invalid operands, or -EIO on - * other TDCALL failures. + * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation, + * or -EIO on other TDCALL failures. */ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) { @@ -128,7 +130,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) ret = __tdcall(TDG_MR_REPORT, &args); if (ret) { if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) - return -EINVAL; + return -ENXIO; + else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY) + return -EBUSY; return -EIO; } @@ -137,6 +141,42 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); /** + * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using + * TDG.MR.RTMR.EXTEND TDCALL. + * @index: Index of RTMR register to be extended. + * @data: Address of the input buffer with RTMR register extend data. + * + * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0 + * specification for more information on TDG.MR.RTMR.EXTEND TDCALL. + * + * It is used in the TDX guest driver module to allow user to extend the RTMR + * registers. + * + * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation, + * or -EIO on other TDCALL failures. + */ +int tdx_mcall_extend_rtmr(u8 index, u8 *data) +{ + struct tdx_module_args args = { + .rcx = virt_to_phys(data), + .rdx = index, + }; + u64 ret; + + ret = __tdcall(TDG_MR_RTMR_EXTEND, &args); + if (ret) { + if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) + return -ENXIO; + if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY) + return -EBUSY; + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr); + +/** * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote * hypercall. 
* @buf: Address of the directly mapped shared kernel buffer which diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 7cd2f395f301..79fa38ca954d 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -27,10 +27,12 @@ CONFIG_CGROUP_DEBUG=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y +CONFIG_KEXEC=y +# Do not remove this as it results in non-bootable kernels +# CONFIG_64BIT is not set CONFIG_SMP=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_NR_CPUS=8 CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y @@ -39,9 +41,6 @@ CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_HZ_1000=y -CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y -# CONFIG_MITIGATION_RETHUNK is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y @@ -52,7 +51,6 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y -CONFIG_COMPAT_32BIT_TIME=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -63,9 +61,7 @@ CONFIG_BINFMT_MISC=y # CONFIG_COMPAT_BRK is not set CONFIG_NET=y CONFIG_PACKET=y -CONFIG_UNIX=y CONFIG_XFRM_USER=y -CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y @@ -134,7 +130,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y -CONFIG_EFI_CAPSULE_LOADER=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_SD=y @@ -210,7 +205,6 @@ CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y CONFIG_HID_GYRATION=y -CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y CONFIG_HID_PANTHERLORD=y CONFIG_PANTHERLORD_FF=y @@ -241,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y CONFIG_AUTOFS_FS=y CONFIG_ISO9660_FS=y @@ -266,19 +259,13 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y -CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_WX=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_UNWINDER_FRAME_POINTER=y CONFIG_DEBUG_ENTRY=y -# CONFIG_64BIT is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 61e25f6209ed..7d7310cdf8b0 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -27,6 +27,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y +CONFIG_KEXEC=y CONFIG_SMP=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y @@ -40,8 +41,6 @@ CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_EFI_MIXED=y CONFIG_HZ_1000=y -CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y @@ -63,9 +62,7 @@ CONFIG_BINFMT_MISC=y # CONFIG_COMPAT_BRK is not set CONFIG_NET=y CONFIG_PACKET=y -CONFIG_UNIX=y CONFIG_XFRM_USER=y -CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y @@ -205,7 +202,6 @@ CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y CONFIG_HID_GYRATION=y -CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y CONFIG_HID_PANTHERLORD=y CONFIG_PANTHERLORD_FF=y @@ -239,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y 
-# CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y CONFIG_AUTOFS_FS=y CONFIG_ISO9660_FS=y @@ -264,13 +259,11 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_WX=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 56cfdc79e2c6..94016c60561e 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -376,33 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_SHA1_SSSE3 - tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" - depends on 64BIT - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - - SHA-NI (SHA Extensions New Instructions) - -config CRYPTO_SHA512_SSSE3 - tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)" - depends on 64BIT - select CRYPTO_SHA512 - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index aa289a9e0153..d402963d6b57 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -51,12 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif -obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o - -obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o -sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index f1b6d40154e3..f1adfba1a76e 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -104,10 +104,12 @@ static void crypto_aegis128_aesni_process_ad( } } -static __always_inline void +static __always_inline int crypto_aegis128_aesni_process_crypt(struct aegis_state *state, struct skcipher_walk *walk, bool enc) { + int err = 0; + while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { if (enc) aegis128_aesni_enc(state, walk->src.virt.addr, @@ -119,7 +121,10 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state, walk->dst.virt.addr, round_down(walk->nbytes, AEGIS128_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); + kernel_fpu_end(); + err = skcipher_walk_done(walk, + walk->nbytes % AEGIS128_BLOCK_SIZE); + kernel_fpu_begin(); } if (walk->nbytes) { @@ -131,8 +136,11 @@ crypto_aegis128_aesni_process_crypt(struct aegis_state *state, aegis128_aesni_dec_tail(state, walk->src.virt.addr, walk->dst.virt.addr, walk->nbytes); - skcipher_walk_done(walk, 0); + kernel_fpu_end(); + err = skcipher_walk_done(walk, 0); + kernel_fpu_begin(); } + return 
err; } static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead) @@ -165,7 +173,7 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, return 0; } -static __always_inline void +static __always_inline int crypto_aegis128_aesni_crypt(struct aead_request *req, struct aegis_block *tag_xor, unsigned int cryptlen, bool enc) @@ -174,20 +182,24 @@ crypto_aegis128_aesni_crypt(struct aead_request *req, struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; + int err; if (enc) - skcipher_walk_aead_encrypt(&walk, req, true); + err = skcipher_walk_aead_encrypt(&walk, req, false); else - skcipher_walk_aead_decrypt(&walk, req, true); + err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; kernel_fpu_begin(); aegis128_aesni_init(&state, &ctx->key, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128_aesni_process_crypt(&state, &walk, enc); - aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); - + err = crypto_aegis128_aesni_process_crypt(&state, &walk, enc); + if (err == 0) + aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); + return err; } static int crypto_aegis128_aesni_encrypt(struct aead_request *req) @@ -196,8 +208,11 @@ static int crypto_aegis128_aesni_encrypt(struct aead_request *req) struct aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen; + int err; - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); + err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); + if (err) + return err; scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); @@ -212,11 +227,14 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) struct aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; + int err; scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); + err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); + if (err) + return err; return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? 
-EBADMSG : 0; } diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c index b4bddcd58457..007b250f774c 100644 --- a/arch/x86/crypto/aria_aesni_avx2_glue.c +++ b/arch/x86/crypto/aria_aesni_avx2_glue.c @@ -9,6 +9,7 @@ #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index ab9b38d05332..4c88ef4eba82 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -9,6 +9,7 @@ #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index a7d162388142..5c321f255eb7 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -8,6 +8,7 @@ #include <crypto/algapi.h> #include <linux/crypto.h> #include <linux/err.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 3bd37d664121..cbede120e5f2 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -10,6 +10,7 @@ #include <linux/unaligned.h> #include <linux/crypto.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index dcfc0de333de..d587f05c3c8c 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -7,6 +7,7 @@ #include <crypto/curve25519.h> #include <crypto/internal/kpp.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kernel.h> diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index e640abc1cb8a..9c8b3a335d5c 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <linux/export.h> #include <crypto/algapi.h> #include <crypto/serpent.h> diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S deleted file mode 100644 index 4b49bdc95265..000000000000 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Implement fast SHA-1 with AVX2 instructions. (x86_64) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * Contact Information: - * Ilya Albrekht <ilya.albrekht@intel.com> - * Maxim Locktyukhin <maxim.locktyukhin@intel.com> - * Ronen Zohar <ronen.zohar@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. - * - *This implementation is based on the previous SSSE3 release: - *Visit http://software.intel.com/en-us/articles/ - *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ - * - *Updates 20-byte SHA-1 record at start of 'state', from 'input', for - *even number of 'blocks' consecutive 64-byte blocks. 
- * - *extern "C" void sha1_transform_avx2( - * struct sha1_state *state, const u8* input, int blocks ); - */ - -#include <linux/linkage.h> - -#define CTX %rdi /* arg1 */ -#define BUF %rsi /* arg2 */ -#define CNT %rdx /* arg3 */ - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %eax -#define REG_E %edx -#define REG_TB %ebx -#define REG_TA %r12d -#define REG_RA %rcx -#define REG_RB %rsi -#define REG_RC %rdi -#define REG_RD %rax -#define REG_RE %rdx -#define REG_RTA %r12 -#define REG_RTB %rbx -#define REG_T1 %r11d -#define xmm_mov vmovups -#define avx2_zeroupper vzeroupper -#define RND_F1 1 -#define RND_F2 2 -#define RND_F3 3 - -.macro REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set TB, REG_TB - .set TA, REG_TA - - .set RA, REG_RA - .set RB, REG_RB - .set RC, REG_RC - .set RD, REG_RD - .set RE, REG_RE - - .set RTA, REG_RTA - .set RTB, REG_RTB - - .set T1, REG_T1 -.endm - -#define HASH_PTR %r9 -#define BLOCKS_CTR %r8 -#define BUFFER_PTR %r10 -#define BUFFER_PTR2 %r13 - -#define PRECALC_BUF %r14 -#define WK_BUF %r15 - -#define W_TMP %xmm0 -#define WY_TMP %ymm0 -#define WY_TMP2 %ymm9 - -# AVX2 variables -#define WY0 %ymm3 -#define WY4 %ymm5 -#define WY08 %ymm7 -#define WY12 %ymm8 -#define WY16 %ymm12 -#define WY20 %ymm13 -#define WY24 %ymm14 -#define WY28 %ymm15 - -#define YMM_SHUFB_BSWAP %ymm10 - -/* - * Keep 2 iterations precalculated at a time: - * - 80 DWORDs per iteration * 2 - */ -#define W_SIZE (80*2*2 +16) - -#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) -#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) - - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -.macro PRECALC_RESET_WY - .set WY_00, WY0 - .set WY_04, WY4 - .set WY_08, WY08 - .set WY_12, WY12 - .set WY_16, WY16 - .set WY_20, WY20 - .set WY_24, WY24 - .set WY_28, WY28 - .set WY_32, WY_00 -.endm - -.macro PRECALC_ROTATE_WY - /* Rotate macros */ - .set WY_32, WY_28 - .set WY_28, WY_24 - .set WY_24, WY_20 - .set WY_20, WY_16 - .set WY_16, WY_12 - .set WY_12, WY_08 - .set WY_08, WY_04 - .set WY_04, WY_00 - .set WY_00, WY_32 - - /* Define register aliases */ - .set WY, WY_00 - .set WY_minus_04, WY_04 - .set WY_minus_08, WY_08 - .set WY_minus_12, WY_12 - .set WY_minus_16, WY_16 - .set WY_minus_20, WY_20 - .set WY_minus_24, WY_24 - .set WY_minus_28, WY_28 - .set WY_minus_32, WY -.endm - -.macro PRECALC_00_15 - .if (i == 0) # Initialize and rotate registers - PRECALC_RESET_WY - PRECALC_ROTATE_WY - .endif - - /* message scheduling pre-compute for rounds 0-15 */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vmovdqu (i * 2)(BUFFER_PTR), W_TMP - .elseif ((i & 7) == 1) - vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ - WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY - .elseif ((i & 7) == 4) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - .elseif ((i & 7) == 7) - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_16_31 - /* - * message scheduling pre-compute for rounds 16-31 - * calculating last 32 w[i] values in 8 XMM registers - * pre-calculate K+w[i] values and store to mem - * for later load by ALU add instruction - * - * "brute force" vectorization for rounds 16-31 only - * due to w[i]->w[i-3] dependency - */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - /* w[i-14] */ - vpalignr $8, WY_minus_16, WY_minus_12, WY - 
vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ - .elseif ((i & 7) == 1) - vpxor WY_minus_08, WY, WY - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpxor WY_TMP, WY, WY - vpslldq $12, WY, WY_TMP2 - .elseif ((i & 7) == 3) - vpslld $1, WY, WY_TMP - vpsrld $31, WY, WY - .elseif ((i & 7) == 4) - vpor WY, WY_TMP, WY_TMP - vpslld $2, WY_TMP2, WY - .elseif ((i & 7) == 5) - vpsrld $30, WY_TMP2, WY_TMP2 - vpxor WY, WY_TMP, WY_TMP - .elseif ((i & 7) == 7) - vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_32_79 - /* - * in SHA-1 specification: - * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: - * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization - * since w[i]=>w[i-3] dependency is broken - */ - - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP - .elseif ((i & 7) == 1) - /* W is W_minus_32 before xor */ - vpxor WY_minus_28, WY, WY - .elseif ((i & 7) == 2) - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 3) - vpxor WY_TMP, WY, WY - .elseif ((i & 7) == 4) - vpslld $2, WY, WY_TMP - .elseif ((i & 7) == 5) - vpsrld $30, WY, WY - vpor WY, WY_TMP, WY - .elseif ((i & 7) == 7) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC r, s - .set i, \r - - .if (i < 40) - .set K_XMM, 32*0 - .elseif (i < 80) - .set K_XMM, 32*1 - .elseif (i < 120) - .set K_XMM, 32*2 - .else - .set K_XMM, 32*3 - .endif - - .if (i<32) - PRECALC_00_15 \s - .elseif (i<64) - PRECALC_16_31 \s - .elseif (i < 160) - PRECALC_32_79 \s - .endif -.endm - -.macro ROTATE_STATE - .set T_REG, E - .set E, D - .set D, C - .set C, B - .set B, TB - .set TB, A - .set A, T_REG - - .set T_REG, RE - .set RE, RD - .set RD, RC - .set RC, RB - .set RB, RTB - .set RTB, RA - .set RA, T_REG -.endm - -/* Macro relies on saved ROUND_Fx */ - -.macro RND_FUN f, r - .if (\f == RND_F1) - ROUND_F1 \r - .elseif (\f == RND_F2) - ROUND_F2 \r - .elseif (\f == RND_F3) - ROUND_F3 \r - .endif -.endm - -.macro RR r - .set round_id, (\r % 80) - - .if (round_id == 0) /* Precalculate F for first round */ - .set ROUND_FUNC, RND_F1 - mov B, TB - - rorx $(32-30), B, B /* b>>>2 */ - andn D, TB, T1 - and C, TB - xor T1, TB - .endif - - RND_FUN ROUND_FUNC, \r - ROTATE_STATE - - .if (round_id == 18) - .set ROUND_FUNC, RND_F2 - .elseif (round_id == 38) - .set ROUND_FUNC, RND_F3 - .elseif (round_id == 58) - .set ROUND_FUNC, RND_F2 - .endif - - .set round_id, ( (\r+1) % 80) - - RND_FUN ROUND_FUNC, (\r+1) - ROTATE_STATE -.endm - -.macro ROUND_F1 r - add WK(\r), E - - andn C, A, T1 /* ~b&d */ - lea (RE,RTB), E /* Add F from the previous round */ - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30),A, TB /* b>>>2 for next round */ - - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - /* - * Calculate F for the next round - * (b & c) ^ andn[b, d] - */ - and B, A /* b&c */ - xor T1, A /* F1 = (b&c) ^ (~b&d) */ - - lea (RE,RTA), E /* E += A >>> 5 */ -.endm - -.macro ROUND_F2 r - add WK(\r), E - lea (RE,RTB), E /* Add F from the previous round */ - - /* Calculate F for the next round */ - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - .if ((round_id) < 79) - rorx $(32-30), A, TB /* b>>>2 for next round */ - .endif - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - .if ((round_id) < 79) - xor 
B, A - .endif - - add TA, E /* E += A >>> 5 */ - - .if ((round_id) < 79) - xor C, A - .endif -.endm - -.macro ROUND_F3 r - add WK(\r), E - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - lea (RE,RTB), E /* Add F from the previous round */ - - mov B, T1 - or A, T1 - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30), A, TB /* b>>>2 for next round */ - - /* Calculate F for the next round - * (b and c) or (d and (b or c)) - */ - and C, T1 - and B, A - or T1, A - - add TA, E /* E += A >>> 5 */ - -.endm - -/* Add constant only if (%2 > %3) condition met (uses RTA as temp) - * %1 + %2 >= %3 ? %4 : 0 - */ -.macro ADD_IF_GE a, b, c, d - mov \a, RTA - add $\d, RTA - cmp $\c, \b - cmovge RTA, \a -.endm - -/* - * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining - */ -.macro SHA1_PIPELINED_MAIN_BODY - - REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - mov %rsp, PRECALC_BUF - lea (2*4*80+32)(%rsp), WK_BUF - - # Precalc WK for first 2 blocks - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 - .set i, 0 - .rept 160 - PRECALC i - .set i, i + 1 - .endr - - /* Go to next block if needed */ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - xchg WK_BUF, PRECALC_BUF - - .align 32 -.L_loop: - /* - * code loops through more than one block - * we use K_BASE value as a signal of a last block, - * it is set below by: cmovae BUFFER_PTR, K_BASE - */ - test BLOCKS_CTR, BLOCKS_CTR - jnz .L_begin - .align 32 - jmp .L_end - .align 32 -.L_begin: - - /* - * Do first block - * rounds: 0,2,4,6,8 - */ - .set j, 0 - .rept 5 - RR j - .set j, j+2 - .endr - - /* - * rounds: - * 10,12,14,16,18 - * 20,22,24,26,28 - * 30,32,34,36,38 - * 40,42,44,46,48 - * 50,52,54,56,58 - */ - .rept 25 - RR j - .set j, j+2 - .endr - - /* Update Counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 - /* - * rounds - * 60,62,64,66,68 - * 70,72,74,76,78 - */ - .rept 10 - RR j - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - test BLOCKS_CTR, BLOCKS_CTR - jz .L_loop - - mov TB, B - - /* Process second block */ - /* - * rounds - * 0+80, 2+80, 4+80, 6+80, 8+80 - * 10+80,12+80,14+80,16+80,18+80 - */ - - .set j, 0 - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 20+80,22+80,24+80,26+80,28+80 - * 30+80,32+80,34+80,36+80,38+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 40+80,42+80,44+80,46+80,48+80 - * 50+80,52+80,54+80,56+80,58+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* update counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - - /* - * rounds - * 60+80,62+80,64+80,66+80,68+80 - * 70+80,72+80,74+80,76+80,78+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - /* Reset state for AVX2 reg permutation */ - mov A, TA - mov TB, A - mov C, TB - mov E, C - mov D, B - mov TA, D - - REGALLOC - - xchg WK_BUF, PRECALC_BUF - - jmp .L_loop - - .align 32 -.L_end: - -.endm -/* - * macro implements SHA-1 function's body for several 64-byte blocks - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_FUNC_START(\name) - - push %rbx - push %r12 - push %r13 - 
push %r14 - push %r15 - - RESERVE_STACK = (W_SIZE*4 + 8+24) - - /* Align stack */ - push %rbp - mov %rsp, %rbp - and $~(0x20-1), %rsp - sub $RESERVE_STACK, %rsp - - avx2_zeroupper - - /* Setup initial values */ - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - mov BUF, BUFFER_PTR2 - mov CNT, BLOCKS_CTR - - xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - avx2_zeroupper - - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - RET - - SYM_FUNC_END(\name) -.endm - -.section .rodata - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.align 128 -K_XMM_AR: - .long K1, K1, K1, K1 - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f -.text - -SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S deleted file mode 100644 index cade913d4882..000000000000 --- a/arch/x86/crypto/sha1_ni_asm.S +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley <sean.m.gulley@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -/* gcc conversion */ -#define FRAME_SIZE 32 /* space for 2x16 bytes */ - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define SHUF_MASK %xmm7 - - -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. - * - * void sha1_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks) - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ -.text -SYM_TYPED_FUNC_START(sha1_ni_transform) - push %rbp - mov %rsp, %rbp - sub $FRAME_SIZE, %rsp - and $~0xF, %rsp - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* load initial hash values */ - pinsrd $3, 1*16(DIGEST_PTR), E0 - movdqu 0*16(DIGEST_PTR), ABCD - pand UPPER_WORD_MASK(%rip), E0 - pshufd $0x1B, ABCD, ABCD - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa E0, (0*16)(%rsp) - movdqa ABCD, (1*16)(%rsp) - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG0 - pshufb SHUF_MASK, MSG0 - paddd MSG0, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG1 - pshufb SHUF_MASK, MSG1 - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG2 - pshufb SHUF_MASK, MSG2 - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG3 - pshufb SHUF_MASK, MSG3 - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - 
sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - pxor MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte (0*16)(%rsp), E0 - paddd (1*16)(%rsp), ABCD - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - pshufd $0x1B, ABCD, ABCD - movdqu ABCD, 0*16(DIGEST_PTR) - pextrd $3, E0, 1*16(DIGEST_PTR) - -.Ldone_hash: - mov %rbp, %rsp - pop %rbp - - RET -SYM_FUNC_END(sha1_ni_transform) - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 -.align 16 -UPPER_WORD_MASK: - .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S deleted file mode 100644 index f54988c80eb4..000000000000 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ /dev/null @@ -1,554 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental - * SSE3 instruction set extensions introduced in Intel Core Microarchitecture - * processors. CPUs supporting Intel(R) AVX extensions will get an additional - * boost. - * - * This work was inspired by the vectorized implementation of Dean Gaudet. - * Additional information on it can be found at: - * http://www.arctic.org/~dean/crypto/sha1.html - * - * It was improved upon with more efficient vectorization of the message - * scheduling. This implementation has also been optimized for all current and - * several future generations of Intel CPUs. 
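The interface comment on sha1_ni_transform() above states the contract shared by all of these transforms: the caller passes only complete 64-byte blocks, while partial-block buffering, message padding and state initialization happen outside the assembly. A minimal caller-side sketch of that contract in C follows; sha1_block_fn, struct sha1_buf and sha1_feed() are illustrative names for this sketch, not kernel APIs, and finalization/padding is left out.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*sha1_block_fn)(uint32_t *digest, const void *data,
			      uint32_t nblocks);

struct sha1_buf {
	uint32_t digest[5];
	uint8_t  partial[64];
	size_t   fill;		/* bytes currently buffered in partial[] */
};

static void sha1_feed(struct sha1_buf *s, const uint8_t *data, size_t len,
		      sha1_block_fn transform)
{
	/* Top up a previously buffered partial block first. */
	if (s->fill) {
		size_t take = 64 - s->fill;

		if (take > len)
			take = len;
		memcpy(s->partial + s->fill, data, take);
		s->fill += take;
		data += take;
		len -= take;
		if (s->fill == 64) {
			transform(s->digest, s->partial, 1);
			s->fill = 0;
		}
	}

	/* Hand every remaining complete 64-byte block to the transform. */
	if (len >= 64) {
		transform(s->digest, data, (uint32_t)(len / 64));
		data += len - (len % 64);
		len %= 64;
	}

	/* Keep the tail buffered; padding is applied only at finalization. */
	memcpy(s->partial, data, len);
	s->fill = len;
}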
- * - * See this article for more information about the implementation details: - * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ - * - * Copyright (C) 2010, Intel Corp. - * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> - * Ronen Zohar <ronen.zohar@intel.com> - * - * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: - * Author: Mathias Krause <minipli@googlemail.com> - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -#define CTX %rdi // arg1 -#define BUF %rsi // arg2 -#define CNT %rdx // arg3 - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %r12d -#define REG_E %edx - -#define REG_T1 %eax -#define REG_T2 %ebx - -#define K_BASE %r8 -#define HASH_PTR %r9 -#define BUFFER_PTR %r10 -#define BUFFER_END %r11 - -#define W_TMP1 %xmm0 -#define W_TMP2 %xmm9 - -#define W0 %xmm1 -#define W4 %xmm2 -#define W8 %xmm3 -#define W12 %xmm4 -#define W16 %xmm5 -#define W20 %xmm6 -#define W24 %xmm7 -#define W28 %xmm8 - -#define XMM_SHUFB_BSWAP %xmm10 - -/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ -#define WK(t) (((t) & 15) * 4)(%rsp) -#define W_PRECALC_AHEAD 16 - -/* - * This macro implements the SHA-1 function's body for single 64-byte block - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_TYPED_FUNC_START(\name) - - push %rbx - push %r12 - push %rbp - mov %rsp, %rbp - - sub $64, %rsp # allocate workspace - and $~15, %rsp # align stack - - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - shl $6, CNT # multiply by 64 - add BUF, CNT - mov CNT, BUFFER_END - - lea K_XMM_AR(%rip), K_BASE - xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - # cleanup workspace - mov $8, %ecx - mov %rsp, %rdi - xor %eax, %eax - rep stosq - - mov %rbp, %rsp # deallocate workspace - pop %rbp - pop %r12 - pop %rbx - RET - - SYM_FUNC_END(\name) -.endm - -/* - * This macro implements 80 rounds of SHA-1 for one 64-byte block - */ -.macro SHA1_PIPELINED_MAIN_BODY - INIT_REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - .set i, 0 - .rept W_PRECALC_AHEAD - W_PRECALC i - .set i, (i+1) - .endr - -.align 4 -1: - RR F1,A,B,C,D,E,0 - RR F1,D,E,A,B,C,2 - RR F1,B,C,D,E,A,4 - RR F1,E,A,B,C,D,6 - RR F1,C,D,E,A,B,8 - - RR F1,A,B,C,D,E,10 - RR F1,D,E,A,B,C,12 - RR F1,B,C,D,E,A,14 - RR F1,E,A,B,C,D,16 - RR F1,C,D,E,A,B,18 - - RR F2,A,B,C,D,E,20 - RR F2,D,E,A,B,C,22 - RR F2,B,C,D,E,A,24 - RR F2,E,A,B,C,D,26 - RR F2,C,D,E,A,B,28 - - RR F2,A,B,C,D,E,30 - RR F2,D,E,A,B,C,32 - RR F2,B,C,D,E,A,34 - RR F2,E,A,B,C,D,36 - RR F2,C,D,E,A,B,38 - - RR F3,A,B,C,D,E,40 - RR F3,D,E,A,B,C,42 - RR F3,B,C,D,E,A,44 - RR F3,E,A,B,C,D,46 - RR F3,C,D,E,A,B,48 - - RR F3,A,B,C,D,E,50 - RR F3,D,E,A,B,C,52 - RR F3,B,C,D,E,A,54 - RR F3,E,A,B,C,D,56 - RR F3,C,D,E,A,B,58 - - add $64, BUFFER_PTR # move to the next 64-byte block - cmp BUFFER_END, BUFFER_PTR # if the current is the last one use - cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun - - RR F4,A,B,C,D,E,60 - RR F4,D,E,A,B,C,62 - RR F4,B,C,D,E,A,64 - RR F4,E,A,B,C,D,66 - RR F4,C,D,E,A,B,68 - - RR F4,A,B,C,D,E,70 - RR F4,D,E,A,B,C,72 - RR F4,B,C,D,E,A,74 - RR F4,E,A,B,C,D,76 - RR F4,C,D,E,A,B,78 - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), B - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - RESTORE_RENAMED_REGS - cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end - jne 1b -.endm - 
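SHA1_PIPELINED_MAIN_BODY above runs the 80 rounds as four groups of 20 (F1..F4 paired with the constants K1..K4 from K_XMM_AR) and then folds the working variables back into the state with UPDATE_HASH. For reference, the same computation in scalar C, assuming the message schedule w[0..79] has already been expanded (a plain sketch, not the vectorized code being removed here):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Compress one block whose schedule w[0..79] is already expanded. */
static void sha1_rounds(uint32_t state[5], const uint32_t w[80])
{
	uint32_t a = state[0], b = state[1], c = state[2];
	uint32_t d = state[3], e = state[4];
	int i;

	for (i = 0; i < 80; i++) {
		uint32_t f, k, t;

		if (i < 20) {		/* F1 */
			f = (b & c) | (~b & d);
			k = 0x5a827999;
		} else if (i < 40) {	/* F2 */
			f = b ^ c ^ d;
			k = 0x6ed9eba1;
		} else if (i < 60) {	/* F3 */
			f = (b & c) | (b & d) | (c & d);
			k = 0x8f1bbcdc;
		} else {		/* F4 */
			f = b ^ c ^ d;
			k = 0xca62c1d6;
		}

		t = rol32(a, 5) + f + e + k + w[i];
		e = d;
		d = c;
		c = rol32(b, 30);
		b = a;
		a = t;
	}

	/* Feedforward, as done by UPDATE_HASH above. */
	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
}

The assembly differs only in bookkeeping: w[i]+K is precomputed into the WK() stack slots, so the constant is already folded into the per-round add.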
-.macro INIT_REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set T1, REG_T1 - .set T2, REG_T2 -.endm - -.macro RESTORE_RENAMED_REGS - # order is important (REG_C is where it should be) - mov B, REG_B - mov D, REG_D - mov A, REG_A - mov E, REG_E -.endm - -.macro SWAP_REG_NAMES a, b - .set _T, \a - .set \a, \b - .set \b, _T -.endm - -.macro F1 b, c, d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - xor \d, T1 - and \b, T1 - xor \d, T1 -.endm - -.macro F2 b, c, d - mov \d, T1 - SWAP_REG_NAMES \d, T1 - xor \c, T1 - xor \b, T1 -.endm - -.macro F3 b, c ,d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - mov \b, T2 - or \b, T1 - and \c, T2 - and \d, T1 - or T2, T1 -.endm - -.macro F4 b, c, d - F2 \b, \c, \d -.endm - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -/* - * RR does two rounds of SHA-1 back to back with W[] pre-calc - * t1 = F(b, c, d); e += w(i) - * e += t1; b <<= 30; d += w(i+1); - * t1 = F(a, b, c); - * d += t1; a <<= 5; - * e += a; - * t1 = e; a >>= 7; - * t1 <<= 5; - * d += t1; - */ -.macro RR F, a, b, c, d, e, round - add WK(\round), \e - \F \b, \c, \d # t1 = F(b, c, d); - W_PRECALC (\round + W_PRECALC_AHEAD) - rol $30, \b - add T1, \e - add WK(\round + 1), \d - - \F \a, \b, \c - W_PRECALC (\round + W_PRECALC_AHEAD + 1) - rol $5, \a - add \a, \e - add T1, \d - ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) - - mov \e, T1 - SWAP_REG_NAMES \e, T1 - - rol $5, T1 - add T1, \d - - # write: \a, \b - # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c -.endm - -.macro W_PRECALC r - .set i, \r - - .if (i < 20) - .set K_XMM, 0 - .elseif (i < 40) - .set K_XMM, 16 - .elseif (i < 60) - .set K_XMM, 32 - .elseif (i < 80) - .set K_XMM, 48 - .endif - - .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) - .set i, ((\r) % 80) # pre-compute for the next iteration - .if (i == 0) - W_PRECALC_RESET - .endif - W_PRECALC_00_15 - .elseif (i<32) - W_PRECALC_16_31 - .elseif (i < 80) // rounds 32-79 - W_PRECALC_32_79 - .endif -.endm - -.macro W_PRECALC_RESET - .set W, W0 - .set W_minus_04, W4 - .set W_minus_08, W8 - .set W_minus_12, W12 - .set W_minus_16, W16 - .set W_minus_20, W20 - .set W_minus_24, W24 - .set W_minus_28, W28 - .set W_minus_32, W -.endm - -.macro W_PRECALC_ROTATE - .set W_minus_32, W_minus_28 - .set W_minus_28, W_minus_24 - .set W_minus_24, W_minus_20 - .set W_minus_20, W_minus_16 - .set W_minus_16, W_minus_12 - .set W_minus_12, W_minus_08 - .set W_minus_08, W_minus_04 - .set W_minus_04, W - .set W, W_minus_32 -.endm - -.macro W_PRECALC_SSSE3 - -.macro W_PRECALC_00_15 - W_PRECALC_00_15_SSSE3 -.endm -.macro W_PRECALC_16_31 - W_PRECALC_16_31_SSSE3 -.endm -.macro W_PRECALC_32_79 - W_PRECALC_32_79_SSSE3 -.endm - -/* message scheduling pre-compute for rounds 0-15 */ -.macro W_PRECALC_00_15_SSSE3 - .if ((i & 3) == 0) - movdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - pshufb XMM_SHUFB_BSWAP, W_TMP1 - movdqa W_TMP1, W - .elseif ((i & 3) == 2) - paddd (K_BASE), W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 16-31 - * - * - calculating last 32 w[i] values in 8 XMM registers - * - pre-calculate K+w[i] values and store to mem, for later load by ALU add - * instruction - * - * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] - * dependency, but improves for 32-79 - */ -.macro W_PRECALC_16_31_SSSE3 - # blended scheduling of vector and scalar instruction streams, one 4-wide - # vector iteration / 4 scalar rounds - .if ((i 
& 3) == 0) - movdqa W_minus_12, W - palignr $8, W_minus_16, W # w[i-14] - movdqa W_minus_04, W_TMP1 - psrldq $4, W_TMP1 # w[i-3] - pxor W_minus_08, W - .elseif ((i & 3) == 1) - pxor W_minus_16, W_TMP1 - pxor W_TMP1, W - movdqa W, W_TMP2 - movdqa W, W_TMP1 - pslldq $12, W_TMP2 - .elseif ((i & 3) == 2) - psrld $31, W - pslld $1, W_TMP1 - por W, W_TMP1 - movdqa W_TMP2, W - psrld $30, W_TMP2 - pslld $2, W - .elseif ((i & 3) == 3) - pxor W, W_TMP1 - pxor W_TMP2, W_TMP1 - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 32-79 - * - * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken - */ -.macro W_PRECALC_32_79_SSSE3 - .if ((i & 3) == 0) - movdqa W_minus_04, W_TMP1 - pxor W_minus_28, W # W is W_minus_32 before xor - palignr $8, W_minus_08, W_TMP1 - .elseif ((i & 3) == 1) - pxor W_minus_16, W - pxor W_TMP1, W - movdqa W, W_TMP1 - .elseif ((i & 3) == 2) - psrld $30, W - pslld $2, W_TMP1 - por W, W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_SSSE3 - - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.section .rodata -.align 16 - -K_XMM_AR: - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - - -.section .text - -W_PRECALC_SSSE3 -.macro xmm_mov a, b - movdqu \a,\b -.endm - -/* - * SSSE3 optimized implementation: - * - * extern "C" void sha1_transform_ssse3(struct sha1_state *state, - * const u8 *data, int blocks); - * - * Note that struct sha1_state is assumed to begin with u32 state[5]. 
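The rewritten recurrence quoted in the W_PRECALC_32_79 comment above is an exact identity: because the schedule is linear over XOR, composing the standard recurrence with itself doubles both the index offsets and the rotation, giving w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 for i >= 32 and breaking the w[i] -> w[i-3] dependency that otherwise prevents computing four schedule words per vector operation. A small self-checking C sketch of the two forms (sha1_schedule_check() is an illustrative name, not kernel code):

#include <assert.h>
#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* w[0..15] holds one loaded block; both expansions must agree for all 80 words. */
static void sha1_schedule_check(uint32_t w[80])
{
	uint32_t ref[80];
	int i;

	for (i = 0; i < 16; i++)
		ref[i] = w[i];
	for (i = 16; i < 80; i++)
		ref[i] = rol32(ref[i - 3] ^ ref[i - 8] ^ ref[i - 14] ^ ref[i - 16], 1);

	for (i = 16; i < 32; i++)
		w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
	for (i = 32; i < 80; i++)
		w[i] = rol32(w[i - 6] ^ w[i - 16] ^ w[i - 28] ^ w[i - 32], 2);

	for (i = 0; i < 80; i++)
		assert(w[i] == ref[i]);
}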
- */ -SHA1_VECTOR_ASM sha1_transform_ssse3 - -.macro W_PRECALC_AVX - -.purgem W_PRECALC_00_15 -.macro W_PRECALC_00_15 - W_PRECALC_00_15_AVX -.endm -.purgem W_PRECALC_16_31 -.macro W_PRECALC_16_31 - W_PRECALC_16_31_AVX -.endm -.purgem W_PRECALC_32_79 -.macro W_PRECALC_32_79 - W_PRECALC_32_79_AVX -.endm - -.macro W_PRECALC_00_15_AVX - .if ((i & 3) == 0) - vmovdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - vpshufb XMM_SHUFB_BSWAP, W_TMP1, W - .elseif ((i & 3) == 2) - vpaddd (K_BASE), W, W_TMP1 - .elseif ((i & 3) == 3) - vmovdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_16_31_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] - vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] - vpxor W_minus_08, W, W - vpxor W_minus_16, W_TMP1, W_TMP1 - .elseif ((i & 3) == 1) - vpxor W_TMP1, W, W - vpslldq $12, W, W_TMP2 - vpslld $1, W, W_TMP1 - .elseif ((i & 3) == 2) - vpsrld $31, W, W - vpor W, W_TMP1, W_TMP1 - vpslld $2, W_TMP2, W - vpsrld $30, W_TMP2, W_TMP2 - .elseif ((i & 3) == 3) - vpxor W, W_TMP1, W_TMP1 - vpxor W_TMP2, W_TMP1, W - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_32_79_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_08, W_minus_04, W_TMP1 - vpxor W_minus_28, W, W # W is W_minus_32 before xor - .elseif ((i & 3) == 1) - vpxor W_minus_16, W_TMP1, W_TMP1 - vpxor W_TMP1, W, W - .elseif ((i & 3) == 2) - vpslld $2, W, W_TMP1 - vpsrld $30, W, W - vpor W, W_TMP1, W - .elseif ((i & 3) == 3) - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_AVX - -W_PRECALC_AVX -.purgem xmm_mov -.macro xmm_mov a, b - vmovdqu \a,\b -.endm - - -/* AVX optimized implementation: - * extern "C" void sha1_transform_avx(struct sha1_state *state, - * const u8 *data, int blocks); - */ -SHA1_VECTOR_ASM sha1_transform_avx diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c deleted file mode 100644 index 0a912bfc86c5..000000000000 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * Glue code for the SHA1 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha1_generic.c - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - * Copyright (c) Mathias Krause <minipli@googlemail.com> - * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <asm/cpu_device_id.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/sha1.h> -#include <crypto/sha1_base.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> - -static const struct x86_cpu_id module_cpu_ids[] = { - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static inline int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) -{ - int remain; - - /* - * Make sure struct sha1_state begins directly with the SHA1 - * 160-bit internal state, as this is what the asm functions expect. 
- */ - BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - - kernel_fpu_begin(); - remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return remain; -} - -static inline int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, - sha1_block_fn *sha1_xform) -{ - kernel_fpu_begin(); - sha1_base_do_finup(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return sha1_base_finish(desc, out); -} - -asmlinkage void sha1_transform_ssse3(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update(desc, data, len, sha1_transform_ssse3); -} - -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_ssse3); -} - -static struct shash_alg sha1_ssse3_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ssse3_update, - .finup = sha1_ssse3_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shash(&sha1_ssse3_alg); - return 0; -} - -static void unregister_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shash(&sha1_ssse3_alg); -} - -asmlinkage void sha1_transform_avx(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update(desc, data, len, sha1_transform_avx); -} - -static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_avx); -} - -static struct shash_alg sha1_avx_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx_update, - .finup = sha1_avx_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx", - .cra_priority = 160, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha1_avx(void) -{ - if (avx_usable()) - return crypto_register_shash(&sha1_avx_alg); - return 0; -} - -static void unregister_sha1_avx(void) -{ - if (avx_usable()) - crypto_unregister_shash(&sha1_avx_alg); -} - -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks); - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) - && boot_cpu_has(X86_FEATURE_BMI1) - && boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static inline void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) -{ - /* Select the optimal transform based on data block size */ - if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(state, data, blocks); - else - sha1_transform_avx(state, data, blocks); -} - -static int 
sha1_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update(desc, data, len, sha1_apply_transform_avx2); -} - -static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); -} - -static struct shash_alg sha1_avx2_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx2_update, - .finup = sha1_avx2_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shash(&sha1_avx2_alg); - return 0; -} - -static void unregister_sha1_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shash(&sha1_avx2_alg); -} - -asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, - int rounds); - -static int sha1_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update(desc, data, len, sha1_ni_transform); -} - -static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_ni_transform); -} - -static struct shash_alg sha1_ni_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ni_update, - .finup = sha1_ni_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ni", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shash(&sha1_ni_alg); - return 0; -} - -static void unregister_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shash(&sha1_ni_alg); -} - -static int __init sha1_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha1_ssse3()) - goto fail; - - if (register_sha1_avx()) { - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_avx2()) { - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_ni()) { - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha1_ssse3_mod_fini(void) -{ - unregister_sha1_ni(); - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); -} - -module_init(sha1_ssse3_mod_init); -module_exit(sha1_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-ssse3"); -MODULE_ALIAS_CRYPTO("sha1-avx"); -MODULE_ALIAS_CRYPTO("sha1-avx2"); -MODULE_ALIAS_CRYPTO("sha1-ni"); diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S deleted file mode 100644 index 5bfce4b045fd..000000000000 --- a/arch/x86/crypto/sha512-avx-asm.S +++ /dev/null @@ -1,423 +0,0 @@ -######################################################################## -# Implement fast SHA-512 with AVX instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. 
-# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# David Cote <david.m.cote@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-512 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -.text - -# Virtual Registers -# ARG1 -digest = %rdi -# ARG2 -msg = %rsi -# ARG3 -msglen = %rdx -T1 = %rcx -T2 = %r8 -a_64 = %r9 -b_64 = %r10 -c_64 = %r11 -d_64 = %r12 -e_64 = %r13 -f_64 = %r14 -g_64 = %r15 -h_64 = %rbx -tmp0 = %rax - -# Local variables (stack frame) - -# Message Schedule -W_SIZE = 80*8 -# W[t] + K[t] | W[t+1] + K[t+1] -WK_SIZE = 2*8 - -frame_W = 0 -frame_WK = frame_W + W_SIZE -frame_size = frame_WK + WK_SIZE - -# Useful QWORD "arrays" for simpler memory references -# MSG, DIGEST, K_t, W_t are arrays -# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even - -# Input message (arg1) -#define MSG(i) 8*i(msg) - -# Output Digest (arg2) -#define DIGEST(i) 8*i(digest) - -# SHA Constants (static mem) -#define K_t(i) 8*i+K512(%rip) - -# Message Schedule (stack frame) -#define W_t(i) 8*i+frame_W(%rsp) - -# W[t]+K[t] (stack frame) -#define WK_2(i) 8*((i%2))+frame_WK(%rsp) - -.macro RotateState - # Rotate symbols a..h right - TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = TMP -.endm - -.macro RORQ p1 p2 - # shld is faster than ror on Sandybridge - shld $(64-\p2), \p1, \p1 -.endm - -.macro SHA512_Round rnd - # Compute Round %%t - mov f_64, T1 # T1 = f - mov e_64, tmp0 # tmp = e - xor g_64, T1 # T1 = f ^ g - RORQ tmp0, 23 # 41 # tmp = e ror 23 - and e_64, T1 # T1 = (f ^ g) & e - xor e_64, tmp0 # tmp = (e ror 23) ^ e - xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) - idx = \rnd - add WK_2(idx), T1 # W[t] + K[t] from message scheduler - RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 - xor e_64, tmp0 # tmp = (((e ror 23) ^ 
e) ror 4) ^ e - mov a_64, T2 # T2 = a - add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h - RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) - add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) - mov a_64, tmp0 # tmp = a - xor c_64, T2 # T2 = a ^ c - and c_64, tmp0 # tmp = a & c - and b_64, T2 # T2 = (a ^ c) & b - xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) - mov a_64, tmp0 # tmp = a - RORQ tmp0, 5 # 39 # tmp = a ror 5 - xor a_64, tmp0 # tmp = (a ror 5) ^ a - add T1, d_64 # e(next_state) = d + T1 - RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 - xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a - lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) - RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) - add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) - RotateState -.endm - -.macro SHA512_2Sched_2Round_avx rnd - # Compute rounds t-2 and t-1 - # Compute message schedule QWORDS t and t+1 - - # Two rounds are computed based on the values for K[t-2]+W[t-2] and - # K[t-1]+W[t-1] which were previously stored at WK_2 by the message - # scheduler. - # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. - # They are then added to their respective SHA512 constants at - # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] - # For brievity, the comments following vectored instructions only refer to - # the first of a pair of QWORDS. - # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} - # The computation of the message schedule and the rounds are tightly - # stitched to take advantage of instruction-level parallelism. - - idx = \rnd - 2 - vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] - idx = \rnd - 15 - vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] - mov f_64, T1 - vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 - mov e_64, tmp0 - vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 - xor g_64, T1 - RORQ tmp0, 23 # 41 - vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 - and e_64, T1 - xor e_64, tmp0 - vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 - xor g_64, T1 - idx = \rnd - add WK_2(idx), T1# - vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 - RORQ tmp0, 4 # 18 - vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 - xor e_64, tmp0 - mov a_64, T2 - add h_64, T1 - vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 - RORQ tmp0, 14 # 14 - add tmp0, T1 - vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 - mov a_64, tmp0 - xor c_64, T2 - vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 - and c_64, tmp0 - and b_64, T2 - vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 - xor tmp0, T2 - mov a_64, tmp0 - vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 - RORQ tmp0, 5 # 39 - vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 - xor a_64, tmp0 - add T1, d_64 - RORQ tmp0, 6 # 34 - xor a_64, tmp0 - vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ - # W[t-15]>>7 ^ W[t-15]<<63 - lea (T1, T2), h_64 - RORQ tmp0, 28 # 28 - vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 - add tmp0, h_64 - RotateState - vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ - # W[t-2]<<25 - mov f_64, T1 - vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) - mov e_64, tmp0 - xor g_64, T1 - idx = \rnd - 16 - vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] - idx = \rnd - 7 - vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] - RORQ tmp0, 23 # 41 - and e_64, T1 - xor e_64, tmp0 - xor g_64, T1 - vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 - idx = \rnd + 1 - add WK_2(idx), T1 - vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) - RORQ tmp0, 4 # 18 - vpaddq %xmm6, 
%xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) - xor e_64, tmp0 - vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + - # s0(W[t-15]) + W[t-16] - mov a_64, T2 - add h_64, T1 - RORQ tmp0, 14 # 14 - add tmp0, T1 - idx = \rnd - vmovdqa %xmm0, W_t(idx) # Store W[t] - vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] - vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds - mov a_64, tmp0 - xor c_64, T2 - and c_64, tmp0 - and b_64, T2 - xor tmp0, T2 - mov a_64, tmp0 - RORQ tmp0, 5 # 39 - xor a_64, tmp0 - add T1, d_64 - RORQ tmp0, 6 # 34 - xor a_64, tmp0 - lea (T1, T2), h_64 - RORQ tmp0, 28 # 28 - add tmp0, h_64 - RotateState -.endm - -######################################################################## -# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks) -# Purpose: Updates the SHA512 digest stored at "state" with the message -# stored in "data". -# The size of the message pointed to by "data" must be an integer multiple -# of SHA512 message blocks. -# "blocks" is the message length in SHA512 blocks -######################################################################## -SYM_TYPED_FUNC_START(sha512_transform_avx) - test msglen, msglen - je .Lnowork - - # Save GPRs - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # Allocate Stack Space - push %rbp - mov %rsp, %rbp - sub $frame_size, %rsp - and $~(0x20 - 1), %rsp - -.Lupdateblock: - - # Load state variables - mov DIGEST(0), a_64 - mov DIGEST(1), b_64 - mov DIGEST(2), c_64 - mov DIGEST(3), d_64 - mov DIGEST(4), e_64 - mov DIGEST(5), f_64 - mov DIGEST(6), g_64 - mov DIGEST(7), h_64 - - t = 0 - .rept 80/2 + 1 - # (80 rounds) / (2 rounds/iteration) + (1 iteration) - # +1 iteration because the scheduler leads hashing by 1 iteration - .if t < 2 - # BSWAP 2 QWORDS - vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 - vmovdqu MSG(t), %xmm0 - vpshufb %xmm1, %xmm0, %xmm0 # BSWAP - vmovdqa %xmm0, W_t(t) # Store Scheduled Pair - vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] - vmovdqa %xmm0, WK_2(t) # Store into WK for rounds - .elseif t < 16 - # BSWAP 2 QWORDS# Compute 2 Rounds - vmovdqu MSG(t), %xmm0 - vpshufb %xmm1, %xmm0, %xmm0 # BSWAP - SHA512_Round t-2 # Round t-2 - vmovdqa %xmm0, W_t(t) # Store Scheduled Pair - vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] - SHA512_Round t-1 # Round t-1 - vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK - .elseif t < 79 - # Schedule 2 QWORDS# Compute 2 Rounds - SHA512_2Sched_2Round_avx t - .else - # Compute 2 Rounds - SHA512_Round t-2 - SHA512_Round t-1 - .endif - t = t+2 - .endr - - # Update digest - add a_64, DIGEST(0) - add b_64, DIGEST(1) - add c_64, DIGEST(2) - add d_64, DIGEST(3) - add e_64, DIGEST(4) - add f_64, DIGEST(5) - add g_64, DIGEST(6) - add h_64, DIGEST(7) - - # Advance to next message block - add $16*8, msg - dec msglen - jnz .Lupdateblock - - # Restore Stack Pointer - mov %rbp, %rsp - pop %rbp - - # Restore GPRs - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - -.Lnowork: - RET -SYM_FUNC_END(sha512_transform_avx) - -######################################################################## -### Binary Data - -.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 -.align 16 -# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. -XMM_QWORD_BSWAP: - .octa 0x08090a0b0c0d0e0f0001020304050607 - -# Mergeable 640-byte rodata section. This allows linker to merge the table -# with other, exactly the same 640-byte fragment of another rodata section -# (if such section exists). 
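The comments in SHA512_Round and SHA512_2Sched_2Round_avx above spell out the standard FIPS 180-4 arithmetic: S1 over e with rotates 14/18/41 and CH(e,f,g) feed T1, S0 over a with rotates 28/34/39 and Maj(a,b,c) feed the new a, and each schedule word is W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]. The same arithmetic in scalar C, for reference only (sha512_schedule() and sha512_round() are illustrative helpers, not the implementation being removed):

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, int n)
{
	return (x >> n) | (x << (64 - n));
}

/* Expand w[0..15] (big-endian loaded) to the full 80-entry schedule. */
static void sha512_schedule(uint64_t w[80])
{
	int t;

	for (t = 16; t < 80; t++) {
		uint64_t s0 = ror64(w[t - 15], 1) ^ ror64(w[t - 15], 8) ^ (w[t - 15] >> 7);
		uint64_t s1 = ror64(w[t - 2], 19) ^ ror64(w[t - 2], 61) ^ (w[t - 2] >> 6);

		w[t] = w[t - 16] + s0 + w[t - 7] + s1;
	}
}

/* One round; wk is W[t] + K[t], matching the precomputed WK_2() slots above. */
static void sha512_round(uint64_t s[8], uint64_t wk)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint64_t S1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
	uint64_t ch  = (e & f) ^ (~e & g);
	uint64_t t1  = h + S1 + ch + wk;
	uint64_t S0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
	uint64_t maj = (a & b) ^ (a & c) ^ (b & c);

	/* Rotate the state: e(next) = d + t1, a(next) = t1 + S0 + Maj. */
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
}

The assembly interleaves two schedule words and two rounds per macro invocation purely for instruction-level parallelism; the arithmetic itself is unchanged.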
-.section .rodata.cst640.K512, "aM", @progbits, 640 -.align 64 -# K[t] used in SHA512 hashing -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S deleted file mode 100644 index 24973f42c43f..000000000000 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ /dev/null @@ -1,750 +0,0 @@ -######################################################################## -# Implement fast SHA-512 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# David Cote <david.m.cote@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-512 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 1 blocks at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -.text - -# Virtual Registers -Y_0 = %ymm4 -Y_1 = %ymm5 -Y_2 = %ymm6 -Y_3 = %ymm7 - -YTMP0 = %ymm0 -YTMP1 = %ymm1 -YTMP2 = %ymm2 -YTMP3 = %ymm3 -YTMP4 = %ymm8 -XFER = YTMP0 - -BYTE_FLIP_MASK = %ymm9 - -# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 -CTX1 = %rdi -CTX2 = %r12 -# 2nd arg -INP = %rsi -# 3rd arg -NUM_BLKS = %rdx - -c = %rcx -d = %r8 -e = %rdx -y3 = %rsi - -TBL = %rdi # clobbers CTX1 - -a = %rax -b = %rbx - -f = %r9 -g = %r10 -h = %r11 -old_h = %r11 - -T1 = %r12 # clobbers CTX2 -y0 = %r13 -y1 = %r14 -y2 = %r15 - -# Local variables (stack frame) -XFER_SIZE = 4*8 -SRND_SIZE = 1*8 -INP_SIZE = 1*8 -INPEND_SIZE = 1*8 -CTX_SIZE = 1*8 - -frame_XFER = 0 -frame_SRND = frame_XFER + XFER_SIZE -frame_INP = frame_SRND + SRND_SIZE -frame_INPEND = frame_INP + INP_SIZE -frame_CTX = frame_INPEND + INPEND_SIZE -frame_size = frame_CTX + CTX_SIZE - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask -# Load ymm with mem and byte swap each dword -.macro COPY_YMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm -# rotate_Ys -# Rotate values of symbols Y0...Y3 -.macro rotate_Ys - Y_ = Y_0 - Y_0 = Y_1 - Y_1 = Y_2 - Y_2 = Y_3 - Y_3 = Y_ -.endm - -# RotateState -.macro RotateState - # Rotate symbols a..h right - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL -# YDST = {YSRC1, YSRC2} >> RVAL*8 -.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL - vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} - vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 -.endm - -.macro FOUR_ROUNDS_AND_SCHED -################################### RND N + 0 ######################################### - - # Extract w[t-7] - MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] - # Calculate w[t-16] + w[t-7] - vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] - # Extract w[t-15] - MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] - - # Calculate sigma0 - - # Calculate w[t-15] ror 1 - vpsrlq $1, YTMP1, YTMP2 - vpsllq $(64-1), YTMP1, YTMP3 - vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 - # Calculate w[t-15] shr 7 - vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - add frame_XFER(%rsp),h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - rorx $34, a, T1 # T1 = a >> 34 # S0B - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - xor g, y2 # y2 = f^g # CH - 
rorx $14, e, y1 # y1 = (e >> 14) # S1 - - and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $39, a, y1 # y1 = a >> 39 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 = S1 + CH # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - -################################### RND N + 1 ######################################### - - # Calculate w[t-15] ror 8 - vpsrlq $8, YTMP1, YTMP2 - vpsllq $(64-8), YTMP1, YTMP1 - vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 - # XOR the three components - vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 - vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 - - - # Add three components, w[t-16], w[t-7] and sigma0 - vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 - # Move to appropriate lanes for calculating w[16] and w[17] - vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} - # Move to appropriate lanes for calculating w[18] and w[19] - vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} - - # Calculate w[16] and w[17] in both 128 bit lanes - - # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes - vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} - vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} - - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - mov f, y2 # y2 = f # CH - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $14, e, y1 # y1 = (e >> 14) # S1 - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $39, a, y1 # y1 = a >> 39 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - - rorx $28, a, T1 # T1 = (a >> 28) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - - -################################### RND N + 2 ######################################### - - vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} - vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} - vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} - vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ - # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} - - # Add sigma1 to the other compunents to get w[16] and w[17] - vpaddq YTMP4, Y_0, Y_0 # Y_0 = 
{W[1], W[0], W[1], W[0]} - - # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane - vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- - - rorx $18, e, y1 # y1 = e >> 18 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - and e, y2 # y2 = (f^g)&e # CH - - rorx $14, e, y1 # y1 = (e >> 14) # S1 - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $39, a, y1 # y1 = a >> 39 # S0A - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - -################################### RND N + 3 ######################################### - - vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} - vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} - vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} - vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} - vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} - vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ - # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} - - # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] - # to newly calculated sigma1 to get w[18] and w[19] - vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} - - # Form w[19, w[18], w17], w[16] - vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} - - mov a, y3 # y3 = a # MAJA - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - mov f, y2 # y2 = f # CH - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - rorx $39, a, y1 # y1 = a >> 39 # S0A - add y0, y2 # y2 = S1 + CH # -- - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $28, a, T1 # T1 = (a >> 28) # S0 - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - - rotate_Ys -.endm - -.macro DO_4ROUNDS - -################################### RND N + 0 ######################################### - - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e 
>> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - RotateState - -################################### RND N + 1 ######################################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - RotateState - -################################### RND N + 2 ######################################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - RotateState - -################################### RND N + 3 ######################################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $41, e, y0 # y0 = e >> 41 # S1A - rorx $18, e, y1 # y1 = e >> 18 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 - rorx $14, e, y1 # y1 = (e >> 14) # S1 - and e, y2 # y2 = (f^g)&e 
# CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 - rorx $34, a, T1 # T1 = a >> 34 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $39, a, y1 # y1 = a >> 39 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 - rorx $28, a, T1 # T1 = (a >> 28) # S0 - add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - RotateState - -.endm - -######################################################################## -# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks) -# Purpose: Updates the SHA512 digest stored at "state" with the message -# stored in "data". -# The size of the message pointed to by "data" must be an integer multiple -# of SHA512 message blocks. -# "blocks" is the message length in SHA512 blocks -######################################################################## -SYM_TYPED_FUNC_START(sha512_transform_rorx) - # Save GPRs - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # Allocate Stack Space - push %rbp - mov %rsp, %rbp - sub $frame_size, %rsp - and $~(0x20 - 1), %rsp - - shl $7, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, frame_INPEND(%rsp) - - ## load initial digest - mov 8*0(CTX1), a - mov 8*1(CTX1), b - mov 8*2(CTX1), c - mov 8*3(CTX1), d - mov 8*4(CTX1), e - mov 8*5(CTX1), f - mov 8*6(CTX1), g - mov 8*7(CTX1), h - - # save %rdi (CTX) before it gets clobbered - mov %rdi, frame_CTX(%rsp) - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - -.Lloop0: - lea K512(%rip), TBL - - ## byte swap first 16 dwords - COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK - COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK - - mov INP, frame_INP(%rsp) - - ## schedule 64 input dwords, by doing 12 rounds of 4 each - movq $4, frame_SRND(%rsp) - -.align 16 -.Lloop1: - vpaddq (TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddq 1*32(TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddq 2*32(TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddq 3*32(TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - add $(4*32), TBL - FOUR_ROUNDS_AND_SCHED - - subq $1, frame_SRND(%rsp) - jne .Lloop1 - - movq $2, frame_SRND(%rsp) -.Lloop2: - vpaddq (TBL), Y_0, XFER - vmovdqa XFER, frame_XFER(%rsp) - DO_4ROUNDS - vpaddq 1*32(TBL), Y_1, XFER - vmovdqa XFER, frame_XFER(%rsp) - add $(2*32), TBL - DO_4ROUNDS - - vmovdqa Y_2, Y_0 - vmovdqa Y_3, Y_1 - - subq $1, frame_SRND(%rsp) - jne .Lloop2 - - mov frame_CTX(%rsp), CTX2 - addm 8*0(CTX2), a - addm 8*1(CTX2), b - addm 8*2(CTX2), c - addm 8*3(CTX2), d - addm 8*4(CTX2), e - addm 8*5(CTX2), f - addm 8*6(CTX2), g - addm 8*7(CTX2), h - - mov frame_INP(%rsp), INP - add $128, INP - cmp frame_INPEND(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - # Restore Stack Pointer - mov %rbp, %rsp - pop %rbp - - # Restore GPRs - pop %r15 - pop 
%r14 - pop %r13 - pop %r12 - pop %rbx - - vzeroupper - RET -SYM_FUNC_END(sha512_transform_rorx) - -######################################################################## -### Binary Data - - -# Mergeable 640-byte rodata section. This allows linker to merge the table -# with other, exactly the same 640-byte fragment of another rodata section -# (if such section exists). -.section .rodata.cst640.K512, "aM", @progbits, 640 -.align 64 -# K[t] used in SHA512 hashing -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - -.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x08090a0b0c0d0e0f0001020304050607 - .octa 0x18191a1b1c1d1e1f1011121314151617 - -.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 -.align 32 -MASK_YMM_LO: - .octa 0x00000000000000000000000000000000 - .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S deleted file mode 100644 index 30a2c4777f9d..000000000000 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ /dev/null @@ -1,425 +0,0 @@ -######################################################################## -# Implement fast SHA-512 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# David Cote <david.m.cote@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. 
You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-512 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -.text - -# Virtual Registers -# ARG1 -digest = %rdi -# ARG2 -msg = %rsi -# ARG3 -msglen = %rdx -T1 = %rcx -T2 = %r8 -a_64 = %r9 -b_64 = %r10 -c_64 = %r11 -d_64 = %r12 -e_64 = %r13 -f_64 = %r14 -g_64 = %r15 -h_64 = %rbx -tmp0 = %rax - -# Local variables (stack frame) - -W_SIZE = 80*8 -WK_SIZE = 2*8 - -frame_W = 0 -frame_WK = frame_W + W_SIZE -frame_size = frame_WK + WK_SIZE - -# Useful QWORD "arrays" for simpler memory references -# MSG, DIGEST, K_t, W_t are arrays -# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even - -# Input message (arg1) -#define MSG(i) 8*i(msg) - -# Output Digest (arg2) -#define DIGEST(i) 8*i(digest) - -# SHA Constants (static mem) -#define K_t(i) 8*i+K512(%rip) - -# Message Schedule (stack frame) -#define W_t(i) 8*i+frame_W(%rsp) - -# W[t]+K[t] (stack frame) -#define WK_2(i) 8*((i%2))+frame_WK(%rsp) - -.macro RotateState - # Rotate symbols a..h right - TMP = h_64 - h_64 = g_64 - g_64 = f_64 - f_64 = e_64 - e_64 = d_64 - d_64 = c_64 - c_64 = b_64 - b_64 = a_64 - a_64 = TMP -.endm - -.macro SHA512_Round rnd - - # Compute Round %%t - mov f_64, T1 # T1 = f - mov e_64, tmp0 # tmp = e - xor g_64, T1 # T1 = f ^ g - ror $23, tmp0 # 41 # tmp = e ror 23 - and e_64, T1 # T1 = (f ^ g) & e - xor e_64, tmp0 # tmp = (e ror 23) ^ e - xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) - idx = \rnd - add WK_2(idx), T1 # W[t] + K[t] from message scheduler - ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 - xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e - mov a_64, T2 # T2 = a - add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h - ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) - add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) - mov a_64, tmp0 # tmp = a - xor c_64, T2 # T2 = a ^ c - and c_64, tmp0 # tmp = a & c - and b_64, T2 # T2 = (a ^ c) & b - xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) - mov a_64, tmp0 # 
tmp = a - ror $5, tmp0 # 39 # tmp = a ror 5 - xor a_64, tmp0 # tmp = (a ror 5) ^ a - add T1, d_64 # e(next_state) = d + T1 - ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 - xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a - lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) - ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) - add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) - RotateState -.endm - -.macro SHA512_2Sched_2Round_sse rnd - - # Compute rounds t-2 and t-1 - # Compute message schedule QWORDS t and t+1 - - # Two rounds are computed based on the values for K[t-2]+W[t-2] and - # K[t-1]+W[t-1] which were previously stored at WK_2 by the message - # scheduler. - # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. - # They are then added to their respective SHA512 constants at - # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] - # For brievity, the comments following vectored instructions only refer to - # the first of a pair of QWORDS. - # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} - # The computation of the message schedule and the rounds are tightly - # stitched to take advantage of instruction-level parallelism. - # For clarity, integer instructions (for the rounds calculation) are indented - # by one tab. Vectored instructions (for the message scheduler) are indented - # by two tabs. - - mov f_64, T1 - idx = \rnd -2 - movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] - xor g_64, T1 - and e_64, T1 - movdqa %xmm2, %xmm0 # XMM0 = W[t-2] - xor g_64, T1 - idx = \rnd - add WK_2(idx), T1 - idx = \rnd - 15 - movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] - mov e_64, tmp0 - ror $23, tmp0 # 41 - movdqa %xmm5, %xmm3 # XMM3 = W[t-15] - xor e_64, tmp0 - ror $4, tmp0 # 18 - psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 - xor e_64, tmp0 - ror $14, tmp0 # 14 - psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 - add tmp0, T1 - add h_64, T1 - pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] - mov a_64, T2 - xor c_64, T2 - pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] - and b_64, T2 - mov a_64, tmp0 - psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 - and c_64, tmp0 - xor tmp0, T2 - psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 - mov a_64, tmp0 - ror $5, tmp0 # 39 - pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] - xor a_64, tmp0 - ror $6, tmp0 # 34 - pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] - xor a_64, tmp0 - ror $28, tmp0 # 28 - psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 - add tmp0, T2 - add T1, d_64 - psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 - lea (T1, T2), h_64 - RotateState - movdqa %xmm2, %xmm1 # XMM1 = W[t-2] - mov f_64, T1 - xor g_64, T1 - movdqa %xmm5, %xmm4 # XMM4 = W[t-15] - and e_64, T1 - xor g_64, T1 - psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 - idx = \rnd + 1 - add WK_2(idx), T1 - mov e_64, tmp0 - psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 - ror $23, tmp0 # 41 - xor e_64, tmp0 - pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] - ror $4, tmp0 # 18 - xor e_64, tmp0 - pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] - ror $14, tmp0 # 14 - add tmp0, T1 - psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 - add h_64, T1 - mov a_64, T2 - psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 - xor c_64, T2 - and b_64, T2 - pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) - mov a_64, tmp0 - and c_64, tmp0 - idx = \rnd - 7 - movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] - xor tmp0, T2 - pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) - mov a_64, tmp0 - 
paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) - ror $5, tmp0 # 39 - idx =\rnd-16 - paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] - xor a_64, tmp0 - paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] - ror $6, tmp0 # 34 - movdqa %xmm0, W_t(\rnd) # Store scheduled qwords - xor a_64, tmp0 - paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] - ror $28, tmp0 # 28 - idx = \rnd - movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds - add tmp0, T2 - add T1, d_64 - lea (T1, T2), h_64 - RotateState -.endm - -######################################################################## -## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, -## int blocks); -# (struct sha512_state is assumed to begin with u64 state[8]) -# Purpose: Updates the SHA512 digest stored at "state" with the message -# stored in "data". -# The size of the message pointed to by "data" must be an integer multiple -# of SHA512 message blocks. -# "blocks" is the message length in SHA512 blocks. -######################################################################## -SYM_TYPED_FUNC_START(sha512_transform_ssse3) - - test msglen, msglen - je .Lnowork - - # Save GPRs - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - # Allocate Stack Space - push %rbp - mov %rsp, %rbp - sub $frame_size, %rsp - and $~(0x20 - 1), %rsp - -.Lupdateblock: - -# Load state variables - mov DIGEST(0), a_64 - mov DIGEST(1), b_64 - mov DIGEST(2), c_64 - mov DIGEST(3), d_64 - mov DIGEST(4), e_64 - mov DIGEST(5), f_64 - mov DIGEST(6), g_64 - mov DIGEST(7), h_64 - - t = 0 - .rept 80/2 + 1 - # (80 rounds) / (2 rounds/iteration) + (1 iteration) - # +1 iteration because the scheduler leads hashing by 1 iteration - .if t < 2 - # BSWAP 2 QWORDS - movdqa XMM_QWORD_BSWAP(%rip), %xmm1 - movdqu MSG(t), %xmm0 - pshufb %xmm1, %xmm0 # BSWAP - movdqa %xmm0, W_t(t) # Store Scheduled Pair - paddq K_t(t), %xmm0 # Compute W[t]+K[t] - movdqa %xmm0, WK_2(t) # Store into WK for rounds - .elseif t < 16 - # BSWAP 2 QWORDS# Compute 2 Rounds - movdqu MSG(t), %xmm0 - pshufb %xmm1, %xmm0 # BSWAP - SHA512_Round t-2 # Round t-2 - movdqa %xmm0, W_t(t) # Store Scheduled Pair - paddq K_t(t), %xmm0 # Compute W[t]+K[t] - SHA512_Round t-1 # Round t-1 - movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK - .elseif t < 79 - # Schedule 2 QWORDS# Compute 2 Rounds - SHA512_2Sched_2Round_sse t - .else - # Compute 2 Rounds - SHA512_Round t-2 - SHA512_Round t-1 - .endif - t = t+2 - .endr - - # Update digest - add a_64, DIGEST(0) - add b_64, DIGEST(1) - add c_64, DIGEST(2) - add d_64, DIGEST(3) - add e_64, DIGEST(4) - add f_64, DIGEST(5) - add g_64, DIGEST(6) - add h_64, DIGEST(7) - - # Advance to next message block - add $16*8, msg - dec msglen - jnz .Lupdateblock - - # Restore Stack Pointer - mov %rbp, %rsp - pop %rbp - - # Restore GPRs - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - -.Lnowork: - RET -SYM_FUNC_END(sha512_transform_ssse3) - -######################################################################## -### Binary Data - -.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 -.align 16 -# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. -XMM_QWORD_BSWAP: - .octa 0x08090a0b0c0d0e0f0001020304050607 - -# Mergeable 640-byte rodata section. This allows linker to merge the table -# with other, exactly the same 640-byte fragment of another rodata section -# (if such section exists). 
-.section .rodata.cst640.K512, "aM", @progbits, 640 -.align 64 -# K[t] used in SHA512 hashing -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c deleted file mode 100644 index 067684c54395..000000000000 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Cryptographic API. - * - * Glue code for the SHA512 Secure Hash Algorithm assembler - * implementation using supplemental SSE3 / AVX / AVX2 instructions. - * - * This file is based on sha512_generic.c - * - * Copyright (C) 2013 Intel Corporation - * Author: Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <asm/cpu_device_id.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <crypto/sha2.h> -#include <crypto/sha512_base.h> - -asmlinkage void sha512_transform_ssse3(struct sha512_state *state, - const u8 *data, int blocks); - -static int sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha512_block_fn *sha512_xform) -{ - int remain; - - /* - * Make sure struct sha512_state begins directly with the SHA512 - * 512-bit internal state, as this is what the asm functions expect. - */ - BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); - - kernel_fpu_begin(); - remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform); - kernel_fpu_end(); - - return remain; -} - -static int sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha512_block_fn *sha512_xform) -{ - kernel_fpu_begin(); - sha512_base_do_finup(desc, data, len, sha512_xform); - kernel_fpu_end(); - - return sha512_base_finish(desc, out); -} - -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_update(desc, data, len, sha512_transform_ssse3); -} - -static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_finup(desc, data, len, out, sha512_transform_ssse3); -} - -static struct shash_alg sha512_ssse3_algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_ssse3_update, - .finup = sha512_ssse3_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_ssse3_update, - .finup = sha512_ssse3_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha512_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(sha512_ssse3_algs, - ARRAY_SIZE(sha512_ssse3_algs)); - return 0; -} - -static void unregister_sha512_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(sha512_ssse3_algs, - ARRAY_SIZE(sha512_ssse3_algs)); -} - -asmlinkage void sha512_transform_avx(struct sha512_state *state, - const u8 *data, int blocks); -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int sha512_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_update(desc, data, len, sha512_transform_avx); -} - -static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_finup(desc, data, len, out, sha512_transform_avx); -} - -static struct shash_alg sha512_avx_algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_avx_update, - .finup = sha512_avx_finup, - .descsize = 
SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-avx", - .cra_priority = 160, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_avx_update, - .finup = sha512_avx_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-avx", - .cra_priority = 160, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha512_avx(void) -{ - if (avx_usable()) - return crypto_register_shashes(sha512_avx_algs, - ARRAY_SIZE(sha512_avx_algs)); - return 0; -} - -static void unregister_sha512_avx(void) -{ - if (avx_usable()) - crypto_unregister_shashes(sha512_avx_algs, - ARRAY_SIZE(sha512_avx_algs)); -} - -asmlinkage void sha512_transform_rorx(struct sha512_state *state, - const u8 *data, int blocks); - -static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_update(desc, data, len, sha512_transform_rorx); -} - -static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha512_finup(desc, data, len, out, sha512_transform_rorx); -} - -static struct shash_alg sha512_avx2_algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_avx2_update, - .finup = sha512_avx2_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_avx2_update, - .finup = sha512_avx2_finup, - .descsize = SHA512_STATE_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static int register_sha512_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shashes(sha512_avx2_algs, - ARRAY_SIZE(sha512_avx2_algs)); - return 0; -} -static const struct x86_cpu_id module_cpu_ids[] = { - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static void unregister_sha512_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shashes(sha512_avx2_algs, - ARRAY_SIZE(sha512_avx2_algs)); -} - -static int __init sha512_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha512_ssse3()) - goto fail; - - if (register_sha512_avx()) { - unregister_sha512_ssse3(); - goto fail; - } - - if (register_sha512_avx2()) { - unregister_sha512_avx(); - unregister_sha512_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha512_ssse3_mod_fini(void) -{ - unregister_sha512_avx2(); - unregister_sha512_avx(); - unregister_sha512_ssse3(); -} - 
-module_init(sha512_ssse3_mod_init); -module_exit(sha512_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha512"); -MODULE_ALIAS_CRYPTO("sha512-ssse3"); -MODULE_ALIAS_CRYPTO("sha512-avx"); -MODULE_ALIAS_CRYPTO("sha512-avx2"); -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha384-ssse3"); -MODULE_ALIAS_CRYPTO("sha384-avx"); -MODULE_ALIAS_CRYPTO("sha384-avx2"); diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c index 72867fc49ce8..88caf418a06f 100644 --- a/arch/x86/crypto/sm4_aesni_avx_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -11,6 +11,7 @@ #include <asm/fpu/api.h> #include <linux/module.h> #include <linux/crypto.h> +#include <linux/export.h> #include <linux/kernel.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 4c67184dc573..8e9906d36902 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -40,6 +40,7 @@ #include <crypto/algapi.h> #include <crypto/twofish.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 1a1ecfa7f72a..8ad77725bf60 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -9,6 +9,7 @@ #include <crypto/algapi.h> #include <crypto/twofish.h> #include <linux/crypto.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index d83236b96f22..94519688b007 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -369,7 +369,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro STACKLEAK_ERASE_NOCLOBBER -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE PUSH_AND_CLEAR_REGS call stackleak_erase POP_REGS @@ -388,7 +388,7 @@ For 32-bit we have the following conventions - kernel is built with #endif /* !CONFIG_X86_64 */ .macro STACKLEAK_ERASE -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE call stackleak_erase #endif .endm diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 175958b02f2b..8e9a0cc20a4a 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -36,20 +36,20 @@ EXPORT_SYMBOL_GPL(write_ibpb); /* * Define the VERW operand that is disguised as entry code so that - * it can be referenced with KPTI enabled. This ensure VERW can be + * it can be referenced with KPTI enabled. This ensures VERW can be * used late in exit-to-user path after page tables are switched. 
*/ .pushsection .entry.text, "ax" .align L1_CACHE_BYTES, 0xcc -SYM_CODE_START_NOALIGN(mds_verw_sel) +SYM_CODE_START_NOALIGN(x86_verw_sel) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR .word __KERNEL_DS .align L1_CACHE_BYTES, 0xcc -SYM_CODE_END(mds_verw_sel); +SYM_CODE_END(x86_verw_sel); /* For KVM */ -EXPORT_SYMBOL_GPL(mds_verw_sel); +EXPORT_SYMBOL_GPL(x86_verw_sel); .popsection diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ac007ea00979..4877e16da69a 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -473,3 +473,5 @@ 465 i386 listxattrat sys_listxattrat 466 i386 removexattrat sys_removexattrat 467 i386 open_tree_attr sys_open_tree_attr +468 i386 file_getattr sys_file_getattr +469 i386 file_setattr sys_file_setattr diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cfb5ca41e30d..92cf0fe2291e 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -391,6 +391,8 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 54d3e9774d62..f247f5f5cb44 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -62,7 +62,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # @@ -123,6 +123,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 466283326630..c2fb729c270e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event) * If the PEBS counters snapshotting is enabled, * the topdown event is available in PEBS records. 
*/ - if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + if (is_topdown_count(event) && !is_pebs_counter_event_group(event)) static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); @@ -2900,6 +2900,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int msr_b, msr_c; + int msr_offset; if (!mask && !cpuc->acr_cfg_b[idx]) return; @@ -2907,19 +2908,20 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) if (idx < INTEL_PMC_IDX_FIXED) { msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + msr_offset = x86_pmu.addr_offset(idx, false); } else { msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; - idx -= INTEL_PMC_IDX_FIXED; + msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); } if (cpuc->acr_cfg_b[idx] != mask) { - wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + wrmsrl(msr_b + msr_offset, mask); cpuc->acr_cfg_b[idx] = mask; } /* Only need to update the reload value when there is a valid config value. */ if (mask && cpuc->acr_cfg_c[idx] != reload) { - wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + wrmsrl(msr_c + msr_offset, reload); cpuc->acr_cfg_c[idx] = reload; } } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e0815a12db90..a762f7f5b161 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1807,6 +1807,12 @@ static const struct intel_uncore_init_fun lnl_uncore_init __initconst = { .mmio_init = lnl_uncore_mmio_init, }; +static const struct intel_uncore_init_fun ptl_uncore_init __initconst = { + .cpu_init = ptl_uncore_cpu_init, + .mmio_init = ptl_uncore_mmio_init, + .use_discovery = true, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1888,6 +1894,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init), diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 3dcb88c0ecfa..d8815fff7588 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -612,10 +612,12 @@ void tgl_uncore_cpu_init(void); void adl_uncore_cpu_init(void); void lnl_uncore_cpu_init(void); void mtl_uncore_cpu_init(void); +void ptl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); void adl_uncore_mmio_init(void); void lnl_uncore_mmio_init(void); +void ptl_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 18a3022f26a0..7d57ce706feb 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -274,32 +274,15 @@ uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore) return false; } -static int parse_discovery_table(struct pci_dev *dev, int die, - u32 bar_offset, bool *parsed, - int *ignore) +static int __parse_discovery_table(resource_size_t addr, int die, + bool *parsed, int 
*ignore) { struct uncore_global_discovery global; struct uncore_unit_discovery unit; void __iomem *io_addr; - resource_size_t addr; unsigned long size; - u32 val; int i; - pci_read_config_dword(dev, bar_offset, &val); - - if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) - return -EINVAL; - - addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); -#ifdef CONFIG_PHYS_ADDR_T_64BIT - if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { - u32 val2; - - pci_read_config_dword(dev, bar_offset + 4, &val2); - addr |= ((resource_size_t)val2) << 32; - } -#endif size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; io_addr = ioremap(addr, size); if (!io_addr) @@ -342,7 +325,32 @@ static int parse_discovery_table(struct pci_dev *dev, int die, return 0; } -bool intel_uncore_has_discovery_tables(int *ignore) +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed, + int *ignore) +{ + resource_size_t addr; + u32 val; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) + return -EINVAL; + + addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + u32 val2; + + pci_read_config_dword(dev, bar_offset + 4, &val2); + addr |= ((resource_size_t)val2) << 32; + } +#endif + + return __parse_discovery_table(addr, die, parsed, ignore); +} + +static bool intel_uncore_has_discovery_tables_pci(int *ignore) { u32 device, val, entry_id, bar_offset; int die, dvsec = 0, ret = true; @@ -391,6 +399,45 @@ err: return ret; } +static bool intel_uncore_has_discovery_tables_msr(int *ignore) +{ + unsigned long *die_mask; + bool parsed = false; + int cpu, die; + u64 base; + + die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()), + sizeof(unsigned long), GFP_KERNEL); + if (!die_mask) + return false; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + die = topology_logical_die_id(cpu); + if (__test_and_set_bit(die, die_mask)) + continue; + + if (rdmsrq_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base)) + continue; + + if (!base) + continue; + + __parse_discovery_table(base, die, &parsed, ignore); + } + + cpus_read_unlock(); + + kfree(die_mask); + return parsed; +} + +bool intel_uncore_has_discovery_tables(int *ignore) +{ + return intel_uncore_has_discovery_tables_msr(ignore) || + intel_uncore_has_discovery_tables_pci(ignore); +} + void intel_uncore_clear_discovery_tables(void) { struct intel_uncore_discovery_type *type, *next; @@ -604,7 +651,7 @@ void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) } addr = unit->addr; - box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) { pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", type->type_id, unit->id, (unsigned long long)addr); diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 0e94aa7db8e7..dff75c98e22f 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +/* Store the full address of the global discovery table */ +#define UNCORE_DISCOVERY_MSR 0x201e + /* Generic device ID of a discovery table device */ #define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 /* Capability ID for a discovery table device */ @@ -168,3 +171,7 @@ bool intel_generic_uncore_assign_hw_event(struct 
perf_event *event, struct intel_uncore_box *box); void uncore_find_add_unit(struct intel_uncore_discovery_unit *node, struct rb_root *root, u16 *num_units); +struct intel_uncore_type ** +uncore_get_uncores(enum uncore_access_type type_id, int num_extra, + struct intel_uncore_type **extra, int max_num_types, + struct intel_uncore_type **uncores); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index a1a96833e30e..807e582b8f17 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1855,3 +1855,82 @@ void lnl_uncore_mmio_init(void) } /* end of Lunar Lake MMIO uncore support */ + +/* Panther Lake uncore support */ + +#define UNCORE_PTL_MAX_NUM_UNCORE_TYPES 42 +#define UNCORE_PTL_TYPE_IMC 6 +#define UNCORE_PTL_TYPE_SNCU 34 +#define UNCORE_PTL_TYPE_HBO 41 + +#define PTL_UNCORE_GLOBAL_CTL_OFFSET 0x380 + +static struct intel_uncore_type ptl_uncore_imc = { + .name = "imc", + .mmio_map_size = 0xf00, +}; + +static void ptl_uncore_sncu_init_box(struct intel_uncore_box *box) +{ + intel_generic_uncore_mmio_init_box(box); + + /* Clear the global freeze bit */ + if (box->io_addr) + writel(0, box->io_addr + PTL_UNCORE_GLOBAL_CTL_OFFSET); +} + +static struct intel_uncore_ops ptl_uncore_sncu_ops = { + .init_box = ptl_uncore_sncu_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static struct intel_uncore_type ptl_uncore_sncu = { + .name = "sncu", + .ops = &ptl_uncore_sncu_ops, + .mmio_map_size = 0xf00, +}; + +static struct intel_uncore_type ptl_uncore_hbo = { + .name = "hbo", + .mmio_map_size = 0xf00, +}; + +static struct intel_uncore_type *ptl_uncores[UNCORE_PTL_MAX_NUM_UNCORE_TYPES] = { + [UNCORE_PTL_TYPE_IMC] = &ptl_uncore_imc, + [UNCORE_PTL_TYPE_SNCU] = &ptl_uncore_sncu, + [UNCORE_PTL_TYPE_HBO] = &ptl_uncore_hbo, +}; + +#define UNCORE_PTL_MMIO_EXTRA_UNCORES 1 + +static struct intel_uncore_type *ptl_mmio_extra_uncores[UNCORE_PTL_MMIO_EXTRA_UNCORES] = { + &adl_uncore_imc_free_running, +}; + +void ptl_uncore_mmio_init(void) +{ + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, + UNCORE_PTL_MMIO_EXTRA_UNCORES, + ptl_mmio_extra_uncores, + UNCORE_PTL_MAX_NUM_UNCORE_TYPES, + ptl_uncores); +} + +static struct intel_uncore_type *ptl_msr_uncores[] = { + &mtl_uncore_cbox, + NULL +}; + +void ptl_uncore_cpu_init(void) +{ + mtl_uncore_cbox.num_boxes = 6; + mtl_uncore_cbox.ops = &lnl_uncore_msr_ops; + uncore_msr_uncores = ptl_msr_uncores; +} + +/* end of Panther Lake uncore support */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2824dc9950be..e1f370b8d065 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6409,9 +6409,11 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, to_type->get_topology = from_type->get_topology; if (from_type->cleanup_mapping) to_type->cleanup_mapping = from_type->cleanup_mapping; + if (from_type->mmio_map_size) + to_type->mmio_map_size = from_type->mmio_map_size; } -static struct intel_uncore_type ** +struct intel_uncore_type ** uncore_get_uncores(enum uncore_access_type type_id, int num_extra, struct intel_uncore_type **extra, int max_num_types, struct intel_uncore_type **uncores) diff --git 
a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 5d27194a2efa..afdbda2dd7b7 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -34,6 +34,7 @@ #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> +#include <linux/export.h> void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); @@ -391,40 +392,6 @@ static void __init hv_stimer_setup_percpu_clockev(void) old_setup_percpu_clockev(); } -#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) -static u8 __init get_vtl(void) -{ - u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; - struct hv_input_get_vp_registers *input; - struct hv_output_get_vp_registers *output; - unsigned long flags; - u64 ret; - - local_irq_save(flags); - input = *this_cpu_ptr(hyperv_pcpu_input_arg); - output = *this_cpu_ptr(hyperv_pcpu_output_arg); - - memset(input, 0, struct_size(input, names, 1)); - input->partition_id = HV_PARTITION_ID_SELF; - input->vp_index = HV_VP_INDEX_SELF; - input->input_vtl.as_uint8 = 0; - input->names[0] = HV_REGISTER_VSM_VP_STATUS; - - ret = hv_do_hypercall(control, input, output); - if (hv_result_success(ret)) { - ret = output->values[0].reg8 & HV_X64_VTL_MASK; - } else { - pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); - BUG(); - } - - local_irq_restore(flags); - return ret; -} -#else -static inline u8 get_vtl(void) { return 0; } -#endif - /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -707,3 +674,36 @@ bool hv_is_hyperv_initialized(void) return hypercall_msr.enable; } EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); + +int hv_apicid_to_vp_index(u32 apic_id) +{ + u64 control; + u64 status; + unsigned long irq_flags; + struct hv_get_vp_from_apic_id_in *input; + u32 *output, ret; + + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = HV_PARTITION_ID_SELF; + input->apic_ids[0] = apic_id; + + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_INDEX_FROM_APIC_ID; + status = hv_do_hypercall(control, input, output); + ret = output[0]; + + local_irq_restore(irq_flags); + + if (!hv_result_success(status)) { + pr_err("failed to get vp index from apic id %d, status %#llx\n", + apic_id, status); + return -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index); diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 4580936dcb03..042e8712d8de 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -56,7 +56,12 @@ static void __noreturn hv_vtl_restart(char __maybe_unused *cmd) void __init hv_vtl_init_platform(void) { - pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + /* + * This function is a no-op if the VTL mode is not enabled. + * If it is, this function runs if and only the kernel boots in + * VTL2 which the x86 hv initialization path makes sure of. 
+ */ + pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl); x86_platform.realmode_reserve = x86_init_noop; x86_platform.realmode_init = x86_init_noop; @@ -207,63 +212,23 @@ free_lock: return ret; } -static int hv_vtl_apicid_to_vp_id(u32 apic_id) -{ - u64 control; - u64 status; - unsigned long irq_flags; - struct hv_get_vp_from_apic_id_in *input; - u32 *output, ret; - - local_irq_save(irq_flags); - - input = *this_cpu_ptr(hyperv_pcpu_input_arg); - memset(input, 0, sizeof(*input)); - input->partition_id = HV_PARTITION_ID_SELF; - input->apic_ids[0] = apic_id; - - output = *this_cpu_ptr(hyperv_pcpu_output_arg); - - control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID; - status = hv_do_hypercall(control, input, output); - ret = output[0]; - - local_irq_restore(irq_flags); - - if (!hv_result_success(status)) { - pr_err("failed to get vp id from apic id %d, status %#llx\n", - apic_id, status); - return -EINVAL; - } - - return ret; -} - -static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) +static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu) { - int vp_id, cpu; - - /* Find the logical CPU for the APIC ID */ - for_each_present_cpu(cpu) { - if (arch_match_cpu_phys_id(cpu, apicid)) - break; - } - if (cpu >= nr_cpu_ids) - return -EINVAL; + int vp_index; pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid); - vp_id = hv_vtl_apicid_to_vp_id(apicid); + vp_index = hv_apicid_to_vp_index(apicid); - if (vp_id < 0) { + if (vp_index < 0) { pr_err("Couldn't find CPU with APIC ID %d\n", apicid); return -EINVAL; } - if (vp_id > ms_hyperv.max_vp_index) { - pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid); + if (vp_index > ms_hyperv.max_vp_index) { + pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid); return -EINVAL; } - return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip); + return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip); } int __init hv_vtl_early_init(void) diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 31f0d29cbc5e..090f5ac9f492 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -10,6 +10,7 @@ #include <linux/pci.h> #include <linux/irq.h> +#include <linux/export.h> #include <asm/mshyperv.h> static int hv_map_interrupt(union hv_device_id device_id, bool level, @@ -46,7 +47,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level, if (nr_bank < 0) { local_irq_restore(flags); pr_err("%s: unable to generate VP set\n", __func__); - return EINVAL; + return -EINVAL; } intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; @@ -66,7 +67,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level, if (!hv_result_success(status)) hv_status_err(status, "\n"); - return hv_result(status); + return hv_result_to_errno(status); } static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) @@ -88,7 +89,10 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); local_irq_restore(flags); - return hv_result(status); + if (!hv_result_success(status)) + hv_status_err(status, "\n"); + + return hv_result_to_errno(status); } #ifdef CONFIG_PCI_MSI @@ -169,13 +173,34 @@ static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) return dev_id; } -static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector, - struct hv_interrupt_entry *entry) +/** + * hv_map_msi_interrupt() - 
"Map" the MSI IRQ in the hypervisor. + * @data: Describes the IRQ + * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL) + * + * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall. + * + * Return: 0 on success, -errno on failure + */ +int hv_map_msi_interrupt(struct irq_data *data, + struct hv_interrupt_entry *out_entry) { - union hv_device_id device_id = hv_build_pci_dev_id(dev); + struct irq_cfg *cfg = irqd_cfg(data); + struct hv_interrupt_entry dummy; + union hv_device_id device_id; + struct msi_desc *msidesc; + struct pci_dev *dev; + int cpu; - return hv_map_interrupt(device_id, false, cpu, vector, entry); + msidesc = irq_data_get_msi_desc(data); + dev = msi_desc_to_pci_dev(msidesc); + device_id = hv_build_pci_dev_id(dev); + cpu = cpumask_first(irq_data_get_effective_affinity_mask(data)); + + return hv_map_interrupt(device_id, false, cpu, cfg->vector, + out_entry ? out_entry : &dummy); } +EXPORT_SYMBOL_GPL(hv_map_msi_interrupt); static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) { @@ -188,13 +213,11 @@ static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { + struct hv_interrupt_entry *stored_entry; + struct irq_cfg *cfg = irqd_cfg(data); struct msi_desc *msidesc; struct pci_dev *dev; - struct hv_interrupt_entry out_entry, *stored_entry; - struct irq_cfg *cfg = irqd_cfg(data); - const cpumask_t *affinity; - int cpu; - u64 status; + int ret; msidesc = irq_data_get_msi_desc(data); dev = msi_desc_to_pci_dev(msidesc); @@ -204,9 +227,6 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return; } - affinity = irq_data_get_effective_affinity_mask(data); - cpu = cpumask_first_and(affinity, cpu_online_mask); - if (data->chip_data) { /* * This interrupt is already mapped. Let's unmap first. 
@@ -219,14 +239,12 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) stored_entry = data->chip_data; data->chip_data = NULL; - status = hv_unmap_msi_interrupt(dev, stored_entry); + ret = hv_unmap_msi_interrupt(dev, stored_entry); kfree(stored_entry); - if (status != HV_STATUS_SUCCESS) { - hv_status_debug(status, "failed to unmap\n"); + if (ret) return; - } } stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); @@ -235,15 +253,14 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return; } - status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry); - if (status != HV_STATUS_SUCCESS) { + ret = hv_map_msi_interrupt(data, stored_entry); + if (ret) { kfree(stored_entry); return; } - *stored_entry = out_entry; data->chip_data = stored_entry; - entry_to_msi_msg(&out_entry, msg); + entry_to_msi_msg(data->chip_data, msg); return; } @@ -257,7 +274,6 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) { struct hv_interrupt_entry old_entry; struct msi_msg msg; - u64 status; if (!irqd->chip_data) { pr_debug("%s: no chip data\n!", __func__); @@ -270,10 +286,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) kfree(irqd->chip_data); irqd->chip_data = NULL; - status = hv_unmap_msi_interrupt(dev, &old_entry); - - if (status != HV_STATUS_SUCCESS) - hv_status_err(status, "\n"); + (void)hv_unmap_msi_interrupt(dev, &old_entry); } static void hv_msi_free_irq(struct irq_domain *domain, diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 09a165a3c41e..ade6c665c97e 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -9,6 +9,8 @@ #include <linux/bitfield.h> #include <linux/types.h> #include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/export.h> #include <asm/svm.h> #include <asm/sev.h> #include <asm/io.h> @@ -289,7 +291,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) free_page((unsigned long)vmsa); } -int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) +int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu) { struct sev_es_save_area *vmsa = (struct sev_es_save_area *) __get_free_page(GFP_KERNEL | __GFP_ZERO); @@ -298,10 +300,16 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) u64 ret, retry = 5; struct hv_enable_vp_vtl *start_vp_input; unsigned long flags; + int vp_index; if (!vmsa) return -ENOMEM; + /* Find the Hyper-V VP index which might be not the same as APIC ID */ + vp_index = hv_apicid_to_vp_index(apic_id); + if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index) + return -EINVAL; + native_store_gdt(&gdtr); vmsa->gdtr.base = gdtr.address; @@ -349,7 +357,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg; memset(start_vp_input, 0, sizeof(*start_vp_input)); start_vp_input->partition_id = -1; - start_vp_input->vp_index = cpu; + start_vp_input->vp_index = vp_index; start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl; *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1; diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index 1083dc8646f9..8ccbb7c4fc27 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -11,6 +11,7 @@ #include <linux/types.h> +#include <linux/export.h> #include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 5ab1a4598d00..a03aa6f999d1 100644 --- a/arch/x86/include/asm/acpi.h +++ 
b/arch/x86/include/asm/acpi.h @@ -158,13 +158,13 @@ static inline bool acpi_has_cpu_in_madt(void) } #define ACPI_HAVE_ARCH_SET_ROOT_POINTER -static inline void acpi_arch_set_root_pointer(u64 addr) +static __always_inline void acpi_arch_set_root_pointer(u64 addr) { x86_init.acpi.set_root_pointer(addr); } #define ACPI_HAVE_ARCH_GET_ROOT_POINTER -static inline u64 acpi_arch_get_root_pointer(void) +static __always_inline u64 acpi_arch_get_root_pointer(void) { return x86_init.acpi.get_root_pointer(); } diff --git a/arch/x86/include/asm/amd/fch.h b/arch/x86/include/asm/amd/fch.h deleted file mode 100644 index 2cf5153edbc2..000000000000 --- a/arch/x86/include/asm/amd/fch.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_AMD_FCH_H_ -#define _ASM_X86_AMD_FCH_H_ - -#define FCH_PM_BASE 0xFED80300 - -/* Register offsets from PM base: */ -#define FCH_PM_DECODEEN 0x00 -#define FCH_PM_DECODEEN_SMBUS0SEL GENMASK(20, 19) -#define FCH_PM_SCRATCH 0x80 -#define FCH_PM_S5_RESET_STATUS 0xC0 - -#endif /* _ASM_X86_AMD_FCH_H_ */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 68e10e30fe9b..07ba4935e873 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -313,9 +313,9 @@ struct apic { u32 (*get_apic_id)(u32 id); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); /* wakeup secondary CPU using 64-bit wakeup point */ - int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); char *name; }; @@ -333,8 +333,8 @@ struct apic_override { void (*send_IPI_self)(int vector); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); - int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); - int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); }; /* @@ -488,11 +488,14 @@ static inline void apic_setup_apic_calls(void) { } extern void apic_ack_irq(struct irq_data *data); +#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32) +#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10) + static inline bool lapic_vector_set_in_irr(unsigned int vector) { - u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector)); - return !!(irr & (1U << (vector % 32))); + return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector))); } static inline bool is_vector_pending(unsigned int vector) @@ -500,6 +503,65 @@ static inline bool is_vector_pending(unsigned int vector) return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); } +#define MAX_APIC_VECTOR 256 +#define APIC_VECTORS_PER_REG 32 + +/* + * Vector states are maintained by APIC in 32-bit registers that are + * 16 bytes aligned. The status of each vector is kept in a single + * bit. 
+ */ +static inline int apic_find_highest_vector(void *bitmap) +{ + int vec; + u32 *reg; + + for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) { + reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec); + if (*reg) + return __fls(*reg) + vec; + } + + return -1; +} + +static inline u32 apic_get_reg(void *regs, int reg) +{ + return *((u32 *) (regs + reg)); +} + +static inline void apic_set_reg(void *regs, int reg, u32 val) +{ + *((u32 *) (regs + reg)) = val; +} + +static __always_inline u64 apic_get_reg64(void *regs, int reg) +{ + BUILD_BUG_ON(reg != APIC_ICR); + return *((u64 *) (regs + reg)); +} + +static __always_inline void apic_set_reg64(void *regs, int reg, u64 val) +{ + BUILD_BUG_ON(reg != APIC_ICR); + *((u64 *) (regs + reg)) = val; +} + +static inline void apic_clear_vector(int vec, void *bitmap) +{ + clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); +} + +static inline void apic_set_vector(int vec, void *bitmap) +{ + set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); +} + +static inline int apic_test_vector(int vec, void *bitmap) +{ + return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec)); +} + /* * Warm reset vector position: */ diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h index 2930f560d7f3..e1f965bb1e31 100644 --- a/arch/x86/include/asm/ce4100.h +++ b/arch/x86/include/asm/ce4100.h @@ -4,4 +4,10 @@ int ce4100_pci_init(void); +#ifdef CONFIG_SERIAL_8250 +void __init sdv_serial_fixup(void); +#else +static inline void sdv_serial_fixup(void) {}; +#endif + #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5b50e0e35129..602957dd2609 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -336,6 +336,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ @@ -378,6 +379,7 @@ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ #define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ +#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */ #define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ @@ -446,6 +448,7 @@ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ #define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ #define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ +#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ #define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ @@ -453,10 +456,15 @@ #define 
X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ #define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ + #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */ + +#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ @@ -483,6 +491,9 @@ #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ #define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ +#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ +#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ +#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ /* * BUG word(s) @@ -538,5 +549,5 @@ #define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ - +#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 363110e6b2e3..a2c1f2d24b64 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -9,6 +9,14 @@ #include <asm/cpufeature.h> #include <asm/msr.h> +/* + * Define bits that are always set to 1 in DR7, only bit 10 is + * architecturally reserved to '1'. + * + * This is also the init/reset value for DR7. 
+ */ +#define DR7_FIXED_1 0x00000400 + DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL @@ -100,8 +108,8 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) static inline void hw_breakpoint_disable(void) { - /* Zero the control register for HW Breakpoint */ - set_debugreg(0UL, 7); + /* Reset the control register for HW Breakpoint */ + set_debugreg(DR7_FIXED_1, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); @@ -125,9 +133,12 @@ static __always_inline unsigned long local_db_save(void) return 0; get_debugreg(dr7, 7); - dr7 &= ~0x400; /* architecturally set bit */ + + /* Architecturally set bit */ + dr7 &= ~DR7_FIXED_1; if (dr7) - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); + /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c94121acd3d..93e99d2583d6 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -118,7 +118,7 @@ enum xfeature { XFEATURE_PKRU, XFEATURE_PASID, XFEATURE_CET_USER, - XFEATURE_CET_KERNEL_UNUSED, + XFEATURE_CET_KERNEL, XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, @@ -142,7 +142,7 @@ enum xfeature { #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER) -#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED) +#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) @@ -269,6 +269,16 @@ struct cet_user_state { }; /* + * State component 12 is Control-flow Enforcement supervisor states. + * This state includes SSP pointers for privilege levels 0 through 2. + */ +struct cet_supervisor_state { + u64 pl0_ssp; + u64 pl1_ssp; + u64 pl2_ssp; +} __packed; + +/* * State component 15: Architectural LBR configuration state. * The size of Arch LBR state depends on the number of LBRs (lbr_depth). */ @@ -552,6 +562,31 @@ struct fpu_guest { }; /* + * FPU state configuration data for fpu_guest. + * Initialized at boot time. Read only after init. + */ +struct vcpu_fpu_config { + /* + * @size: + * + * The default size of the register state buffer in guest FPUs. + * Includes all supported features except independent managed + * features and features which have to be requested by user space + * before usage. + */ + unsigned int size; + + /* + * @features: + * + * The default supported features bitmap in guest FPUs. Does not + * include independent managed features and features which have to + * be requested by user space before usage. + */ + u64 features; +}; + +/* * FPU state configuration data. Initialized at boot time. Read only after init. */ struct fpu_state_config { @@ -567,8 +602,9 @@ struct fpu_state_config { * @default_size: * * The default size of the register state buffer. Includes all - * supported features except independent managed features and - * features which have to be requested by user space before usage. + * supported features except independent managed features, + * guest-only features and features which have to be requested by + * user space before usage. */ unsigned int default_size; @@ -584,8 +620,8 @@ struct fpu_state_config { * @default_features: * * The default supported features bitmap. 
Does not include - * independent managed features and features which have to - * be requested by user space before usage. + * independent managed features, guest-only features and features + * which have to be requested by user space before usage. */ u64 default_features; /* @@ -606,5 +642,6 @@ struct fpu_state_config { /* FPU state configuration information */ extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; +extern struct vcpu_fpu_config guest_default_cfg; #endif /* _ASM_X86_FPU_TYPES_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b308a76afbb7..7a7dc9d56027 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -46,9 +46,13 @@ /* Features which are dynamically enabled for a process on request */ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA +/* Supervisor features which are enabled only in guest FPUs */ +#define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ - XFEATURE_MASK_CET_USER) + XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_GUEST_SUPERVISOR) /* * A supervisor state component may not always contain valuable information, @@ -75,8 +79,7 @@ * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ -#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ - XFEATURE_MASK_CET_KERNEL) +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 8b1b1abcef15..5a68e9db6518 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,7 +5,7 @@ #if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 #define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector #else -#define __head __section(".head.text") __no_sanitize_undefined +#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase #endif struct x86_mapping_info { diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 43b7657febca..944637a4e6de 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -59,18 +59,6 @@ struct telemetry_plt_config { }; struct telemetry_core_ops { - int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period, - u8 *ioss_min_period, u8 *ioss_max_period); - - int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig, - struct telemetry_evtconfig *ioss_evtconfig, - int pss_len, int ioss_len); - - int (*update_events)(struct telemetry_evtconfig pss_evtconfig, - struct telemetry_evtconfig ioss_evtconfig); - - int (*set_sampling_period)(u8 pss_period, u8 ioss_period); - int (*get_trace_verbosity)(enum telemetry_unit telem_unit, u32 *verbosity); @@ -84,11 +72,6 @@ struct telemetry_core_ops { int (*read_eventlog)(enum telemetry_unit telem_unit, struct telemetry_evtlog *evtlog, int len, int log_all_evts); - - int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts, - u32 *pss_evtmap, u32 *ioss_evtmap); - - int (*reset_events)(void); }; int telemetry_set_pltdata(const struct telemetry_core_ops *ops, @@ -101,35 +84,15 @@ struct telemetry_plt_config *telemetry_get_pltdata(void); int 
telemetry_get_evtname(enum telemetry_unit telem_unit, const char **name, int len); -int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig, - struct telemetry_evtconfig ioss_evtconfig); - -int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts, - u32 *pss_evtmap, u32 *ioss_evtmap); - -int telemetry_reset_events(void); - -int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config, - struct telemetry_evtconfig *ioss_config, - int pss_len, int ioss_len); - int telemetry_read_events(enum telemetry_unit telem_unit, struct telemetry_evtlog *evtlog, int len); -int telemetry_raw_read_events(enum telemetry_unit telem_unit, - struct telemetry_evtlog *evtlog, int len); - int telemetry_read_eventlog(enum telemetry_unit telem_unit, struct telemetry_evtlog *evtlog, int len); int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit, struct telemetry_evtlog *evtlog, int len); -int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period, - u8 *ioss_min_period, u8 *ioss_max_period); - -int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period); - int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit, u32 verbosity); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 5036f13ab69f..5a0d42464d44 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -26,7 +26,22 @@ enum { IRQ_REMAP_X2APIC_MODE, }; -struct vcpu_data { +/* + * This is mainly used to communicate information back-and-forth + * between SVM and IOMMU for setting up and tearing down posted + * interrupt + */ +struct amd_iommu_pi_data { + u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */ + u32 ga_tag; + u32 vector; /* Guest vector of the interrupt */ + int cpu; + bool ga_log_intr; + bool is_guest_mode; + void *ir_data; +}; + +struct intel_iommu_pi_data { u64 pi_desc_addr; /* Physical address of PI Descriptor */ u32 vector; /* Guest vector of the interrupt */ }; diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 9a9b21b78905..b30e5474c18e 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -44,13 +44,13 @@ static __always_inline void native_irq_enable(void) static __always_inline void native_safe_halt(void) { - mds_idle_clear_cpu_buffers(); + x86_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static __always_inline void native_halt(void) { - mds_idle_clear_cpu_buffers(); + x86_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 823c0434bbad..18a5c3119e1a 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL(vm_pre_destroy) KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) @@ -48,7 +49,6 @@ KVM_X86_OP(set_idt) KVM_X86_OP(get_gdt) KVM_X86_OP(set_gdt) KVM_X86_OP(sync_dirty_debug_regs) -KVM_X86_OP(set_dr6) KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) @@ -111,10 +111,11 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) -KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(pi_start_bypass) KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) 
KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) @@ -126,6 +127,7 @@ KVM_X86_OP(enable_smi_window) #endif KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) @@ -136,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) -KVM_X86_OP(msr_filter_changed) +KVM_X86_OP(recalc_msr_intercepts) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9c971f846108..f19a76d3ca0e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/pvclock-abi.h> +#include <asm/debugreg.h> #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/msr-index.h> @@ -126,7 +127,8 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \ + KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -248,7 +250,6 @@ enum x86_intercept_stage; #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) -#define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff #define KVM_GUESTDBG_VALID_MASK \ @@ -296,6 +297,7 @@ enum x86_intercept_stage; */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irqfd; struct kvm_kernel_irq_routing_entry; /* @@ -412,7 +414,6 @@ struct kvm_rmap_head { }; struct kvm_pio_request { - unsigned long linear_rip; unsigned long count; int in; int port; @@ -609,8 +610,15 @@ struct kvm_pmu { struct kvm_pmu_ops; enum { - KVM_DEBUGREG_BP_ENABLED = 1, - KVM_DEBUGREG_WONT_EXIT = 2, + KVM_DEBUGREG_BP_ENABLED = BIT(0), + KVM_DEBUGREG_WONT_EXIT = BIT(1), + /* + * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by + * hardware on exit from or enter to guest. KVM needn't switch them. + * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM + * exit, host values need to be restored. + */ + KVM_DEBUGREG_AUTO_SWITCH = BIT(2), }; struct kvm_mtrr { @@ -693,8 +701,13 @@ struct kvm_vcpu_hv { struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; - /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ + /* + * Preallocated buffers for handling hypercalls that pass sparse vCPU + * sets (for high vCPU counts, they're too large to comfortably fit on + * the stack). 
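To put a rough number behind "too large to comfortably fit on the stack", here is a back-of-the-envelope calculation, assuming the usual Hyper-V limits of 4096 vCPUs grouped into 64-vCPU sparse banks (the real constants live elsewhere in the tree; the ones below are assumptions for illustration):

#include <stdio.h>
#include <stdint.h>

#define ASSUMED_MAX_VCPUS      4096
#define ASSUMED_VCPUS_PER_BANK 64
#define ASSUMED_SPARSE_BANKS   (ASSUMED_MAX_VCPUS / ASSUMED_VCPUS_PER_BANK)

int main(void)
{
	size_t sparse_banks_bytes = ASSUMED_SPARSE_BANKS * sizeof(uint64_t);
	size_t vcpu_mask_bytes    = ASSUMED_MAX_VCPUS / 8;

	/* Roughly 512 bytes each, which is a lot to burn per hypercall frame. */
	printf("sparse_banks: %zu bytes, vcpu_mask: %zu bytes\n",
	       sparse_banks_bytes, vcpu_mask_bytes);
	return 0;
}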
+ */ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); struct hv_vp_assist_page vp_assist_page; @@ -757,6 +770,7 @@ enum kvm_only_cpuid_leafs { CPUID_8000_0022_EAX, CPUID_7_2_EDX, CPUID_24_0_EBX, + CPUID_8000_0021_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -911,6 +925,7 @@ struct kvm_vcpu_arch { bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); + unsigned long cui_linear_rip; gpa_t time; s8 pvclock_tsc_shift; @@ -1028,6 +1043,7 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + int highest_stale_pending_ioapic_eoi; /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; @@ -1305,6 +1321,12 @@ enum kvm_apicv_inhibit { */ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, + /* + * AVIC is disabled because the vCPU's APIC ID is beyond the max + * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG, + NR_APICV_INHIBIT_REASONS, }; @@ -1323,7 +1345,8 @@ enum kvm_apicv_inhibit { __APICV_INHIBIT_REASON(IRQWIN), \ __APICV_INHIBIT_REASON(PIT_REINJ), \ __APICV_INHIBIT_REASON(SEV), \ - __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED) + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG) struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1335,7 +1358,7 @@ struct kvm_arch { bool has_private_mem; bool has_protected_state; bool pre_fault_allowed; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + struct hlist_head *mmu_page_hash; struct list_head active_mmu_pages; /* * A list of kvm_mmu_page structs that, if zapped, could possibly be @@ -1364,11 +1387,13 @@ struct kvm_arch { #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; -#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE - atomic_t assigned_device_count; + unsigned long nr_possible_bypass_irqs; + +#ifdef CONFIG_KVM_IOAPIC struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; +#endif atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map __rcu *apic_map; @@ -1383,12 +1408,8 @@ struct kvm_arch { gpa_t wall_clock; - bool mwait_in_guest; - bool hlt_in_guest; - bool pause_in_guest; - bool cstate_in_guest; + u64 disabled_exits; - unsigned long irq_sources_bitmap; s64 kvmclock_offset; /* @@ -1417,9 +1438,6 @@ struct kvm_arch { struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; - /* reads protected by irq_srcu, writes by irq_lock */ - struct hlist_head mask_notifier_list; - #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; #endif @@ -1442,6 +1460,7 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + bool has_mapped_host_mmio; bool guest_can_read_msr_platform_info; bool exception_payload_enabled; @@ -1571,6 +1590,13 @@ struct kvm_arch { struct kvm_mmu_memory_cache split_desc_cache; gfn_t gfn_direct_bits; + + /* + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero + * value indicates CPU dirty logging is unsupported or disabled in + * current VM. + */ + int cpu_dirty_log_size; }; struct kvm_vm_stat { @@ -1658,6 +1684,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) return dest_mode_logical ? 
APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; } +enum kvm_x86_run_flags { + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), + KVM_RUN_LOAD_GUEST_DR6 = BIT(1), + KVM_RUN_LOAD_DEBUGCTL = BIT(2), +}; + struct kvm_x86_ops { const char *name; @@ -1674,6 +1706,7 @@ struct kvm_x86_ops { unsigned int vm_size; int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); + void (*vm_pre_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ int (*vcpu_precreate)(struct kvm *kvm); @@ -1685,6 +1718,12 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); + /* + * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to + * match the host's value even while the guest is active. + */ + const u64 HOST_OWNED_DEBUGCTL; + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -1707,7 +1746,6 @@ struct kvm_x86_ops { void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); - void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); @@ -1738,7 +1776,7 @@ struct kvm_x86_ops { int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, - bool force_immediate_exit); + u64 run_flags); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1823,11 +1861,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - /* - * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero - * value indicates CPU dirty logging is unsupported or disabled. 
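A hedged sketch of how a caller might use the kvm_x86_run_flags values introduced above, folding the old force_immediate_exit boolean plus the new per-run conditions into the single flags word that vcpu_run() now takes. Everything below is an invented userspace model, not the kernel's caller:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RUN_FORCE_IMMEDIATE_EXIT (1ull << 0)
#define RUN_LOAD_GUEST_DR6       (1ull << 1)
#define RUN_LOAD_DEBUGCTL        (1ull << 2)

static void fake_vcpu_run(uint64_t run_flags)
{
	if (run_flags & RUN_FORCE_IMMEDIATE_EXIT)
		puts("arm an immediate exit");
	if (run_flags & RUN_LOAD_GUEST_DR6)
		puts("reload guest DR6");
	if (run_flags & RUN_LOAD_DEBUGCTL)
		puts("reload guest DEBUGCTL");
}

int main(void)
{
	bool force_immediate_exit = true;	/* the old standalone bool parameter */
	bool dr6_dirty = false, debugctl_dirty = true;	/* hypothetical conditions */
	uint64_t run_flags = 0;

	if (force_immediate_exit)
		run_flags |= RUN_FORCE_IMMEDIATE_EXIT;
	if (dr6_dirty)
		run_flags |= RUN_LOAD_GUEST_DR6;
	if (debugctl_dirty)
		run_flags |= RUN_LOAD_DEBUGCTL;

	fake_vcpu_run(run_flags);
	return 0;
}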
- */ - int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); const struct kvm_x86_nested_ops *nested_ops; @@ -1835,12 +1868,14 @@ struct kvm_x86_ops { void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); - void (*pi_start_assignment)(struct kvm *kvm); + int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector); + void (*pi_start_bypass)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); + bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); @@ -1857,6 +1892,7 @@ struct kvm_x86_ops { int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); @@ -1872,7 +1908,7 @@ struct kvm_x86_ops { int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); - void (*msr_filter_changed)(struct kvm_vcpu *vcpu); + void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); @@ -1930,6 +1966,8 @@ struct kvm_arch_async_pf { extern u32 __read_mostly kvm_nr_uret_msrs; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; +extern bool __read_mostly enable_ipiv; +extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; #define kvm_x86_call(func) static_call(kvm_x86_##func) @@ -1947,7 +1985,7 @@ void kvm_x86_vendor_exit(void); #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT); } #define __KVM_HAVE_ARCH_VM_FREE @@ -1992,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -void kvm_mmu_init_vm(struct kvm *kvm); +int kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, @@ -2023,19 +2061,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -struct kvm_irq_mask_notifier { - void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); - int irq; - struct hlist_node link; -}; - -void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn); -void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn); -void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, - bool mask); - extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); @@ -2194,9 +2219,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return 
!!(*irq_state); } -int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); -void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); - void kvm_inject_nmi(struct kvm_vcpu *vcpu); int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); @@ -2333,6 +2355,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +void kvm_user_return_msr_update_cache(unsigned int index, u64 val); static inline bool kvm_is_supported_user_return_msr(u32 msr) { @@ -2372,9 +2395,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq); - static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ @@ -2416,7 +2436,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ KVM_X86_QUIRK_SLOT_ZAP_ALL | \ - KVM_X86_QUIRK_STUFF_FEATURE_MSRS) + KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) + +#define KVM_X86_CONDITIONAL_QUIRKS \ + (KVM_X86_QUIRK_CD_NW_CLEARED | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was @@ -2427,7 +2452,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); static inline bool kvm_arch_has_irq_bypass(void) { - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); + return enable_device_posted_irqs; } #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e988bac0a4a1..3c2de4ce3b10 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,12 +5,20 @@ #include <asm-generic/module.h> #include <asm/orc_types.h> +struct its_array { +#ifdef CONFIG_MITIGATION_ITS + void **pages; + int num; +#endif +}; + struct mod_arch_specific { #ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; #endif + struct its_array its_pages; }; #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 778444310cfb..abc4659f5809 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -112,12 +112,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return hv_status; } -/* Hypercall to the L0 hypervisor */ -static inline u64 hv_do_nested_hypercall(u64 control, void *input, void *output) -{ - return hv_do_hypercall(control | HV_HYPERCALL_NESTED, input, output); -} - /* Fast hypercall with 8 bytes of input and no output */ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1) { @@ -165,13 +159,6 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) return _hv_do_fast_hypercall8(control, input1); } -static inline u64 hv_do_fast_nested_hypercall8(u16 code, u64 input1) -{ - u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED; - - return _hv_do_fast_hypercall8(control, input1); -} - /* Fast hypercall with 16 bytes of input */ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) { @@ -223,13 +210,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) 
return _hv_do_fast_hypercall16(control, input1, input2); } -static inline u64 hv_do_fast_nested_hypercall16(u16 code, u64 input1, u64 input2) -{ - u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED; - - return _hv_do_fast_hypercall16(control, input1, input2); -} - extern struct hv_vp_assist_page **hv_vp_assist_page; static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) @@ -262,6 +242,8 @@ static inline void hv_apic_init(void) {} struct irq_domain *hv_create_pci_msi_domain(void); +int hv_map_msi_interrupt(struct irq_data *data, + struct hv_interrupt_entry *out_entry); int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, struct hv_interrupt_entry *entry); int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); @@ -269,11 +251,12 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #ifdef CONFIG_AMD_MEM_ENCRYPT bool hv_ghcb_negotiate_protocol(void); void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); -int hv_snp_boot_ap(u32 cpu, unsigned long start_ip); +int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu); #else static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} -static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; } +static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, + unsigned int cpu) { return 0; } #endif #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) @@ -307,6 +290,7 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg) { return native_rdmsrq(reg); } +int hv_apicid_to_vp_index(u32 apic_id); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} @@ -328,6 +312,7 @@ static inline void hv_set_msr(unsigned int reg, u64 value) { } static inline u64 hv_get_msr(unsigned int reg) { return 0; } static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { } static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; } +static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; } #endif /* CONFIG_HYPERV */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b7dded3c8113..b65c3ba5fa14 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -419,6 +419,7 @@ #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) +#define DEBUGCTLMSR_RTM_DEBUG BIT(15) #define MSR_PEBS_FRONTEND 0x000003f7 @@ -628,6 +629,7 @@ #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD_PPIN_CTL 0xc00102f0 #define MSR_AMD_PPIN 0xc00102f1 +#define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 @@ -732,6 +734,11 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Hardware Feedback Support MSRs */ +#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500 +#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501 +#define MSR_AMD_WORKLOAD_HRST 0xc0000502 + /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e @@ -830,6 +837,7 @@ #define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) #define MSR_K7_HWCR_IRPERF_EN_BIT 30 #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) +#define 
MSR_K7_HWCR_CPUID_USER_DIS_BIT 35 #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 #define MSR_K7_HWCR_CPB_DIS_BIT 25 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 4096b8af4ba7..9c2ea29e12a9 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -228,7 +228,7 @@ static __always_inline u64 rdpmc(int counter) #endif /* !CONFIG_PARAVIRT_XXL */ /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ -#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) +#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) /* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ static __always_inline void wrmsrns(u32 msr, u64 val) @@ -237,7 +237,7 @@ static __always_inline void wrmsrns(u32 msr, u64 val) * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant * DS prefix to avoid a trailing NOP. */ - asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS) + asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS) "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); } diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index dd2b129b0418..6ca6516c7492 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,8 +43,6 @@ static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx) static __always_inline void __mwait(u32 eax, u32 ecx) { - mds_idle_clear_cpu_buffers(); - /* * Use the instruction mnemonic with implicit operands, as the LLVM * assembler fails to assemble the mnemonic with explicit operands: @@ -80,7 +78,7 @@ static __always_inline void __mwait(u32 eax, u32 ecx) */ static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) { - /* No MDS buffer clear as this is AMD/HYGON only */ + /* No need for TSA buffer clearing on AMD */ /* "mwaitx %eax, %ebx, %ecx" */ asm volatile(".byte 0x0f, 0x01, 0xfb" @@ -98,7 +96,6 @@ static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) */ static __always_inline void __sti_mwait(u32 eax, u32 ecx) { - mds_idle_clear_cpu_buffers(); asm volatile("sti; mwait" :: "a" (eax), "c" (ecx)); } @@ -115,21 +112,29 @@ static __always_inline void __sti_mwait(u32 eax, u32 ecx) */ static __always_inline void mwait_idle_with_hints(u32 eax, u32 ecx) { + if (need_resched()) + return; + + x86_idle_clear_cpu_buffers(); + if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { const void *addr = ¤t_thread_info()->flags; alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); __monitor(addr, 0, 0); - if (!need_resched()) { - if (ecx & 1) { - __mwait(eax, ecx); - } else { - __sti_mwait(eax, ecx); - raw_local_irq_disable(); - } + if (need_resched()) + goto out; + + if (ecx & 1) { + __mwait(eax, ecx); + } else { + __sti_mwait(eax, ecx); + raw_local_irq_disable(); } } + +out: current_clr_polling(); } diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 20d754b98f3f..10f261678749 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -302,25 +302,31 @@ .endm /* - * Macro to execute VERW instruction that mitigate transient data sampling - * attacks such as MDS. On affected systems a microcode update overloaded VERW - * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. - * + * Macro to execute VERW insns that mitigate transient data sampling + * attacks such as MDS or TSA. 
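Both idle paths touched above (halt and mwait) now funnel buffer clearing through a single x86_idle_clear_cpu_buffers() call, and mwait_idle_with_hints() also reorders its checks: bail out before arming MONITOR if work is already pending, clear buffers once per idle entry, then re-check after MONITOR. A stubbed userspace model of that control flow (it deliberately omits the CLFLUSH workaround and the STI variant; none of these helpers are the kernel's):

#include <stdbool.h>
#include <stdio.h>

static bool need_resched_stub(void)        { return false; }
static void clear_cpu_buffers_stub(void)   { puts("verw"); }
static void monitor_stub(const void *addr) { (void)addr; puts("monitor"); }
static void mwait_stub(unsigned int hints) { printf("mwait %#x\n", hints); }

static void idle_with_hints_model(unsigned int hints)
{
	static int flags;

	if (need_resched_stub())
		return;			/* nothing to do: no VERW, no MONITOR */

	clear_cpu_buffers_stub();	/* one buffer clear per idle entry */

	monitor_stub(&flags);
	if (need_resched_stub())
		return;			/* work arrived between MONITOR and MWAIT */

	mwait_stub(hints);
}

int main(void)
{
	idle_with_hints_model(0x20);
	return 0;
}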
On affected systems a microcode update + * overloaded VERW insns to also clear the CPU buffers. VERW clobbers + * CFLAGS.ZF. * Note: Only the memory operand variant of VERW clears the CPU buffers. */ -.macro CLEAR_CPU_BUFFERS +.macro __CLEAR_CPU_BUFFERS feature #ifdef CONFIG_X86_64 - ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF + ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature #else /* * In 32bit mode, the memory operand must be a %cs reference. The data * segments may not be usable (vm86 mode), and the stack segment may not * be flat (ESPFIX32). */ - ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF + ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature #endif .endm +#define CLEAR_CPU_BUFFERS \ + __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF + +#define VM_CLEAR_CPU_BUFFERS \ + __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM + #ifdef CONFIG_X86_64 .macro CLEAR_BRANCH_HISTORY ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP @@ -567,24 +573,24 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_idle_clear); +DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); -extern u16 mds_verw_sel; +extern u16 x86_verw_sel; #include <asm/segment.h> /** - * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * x86_clear_cpu_buffers - Buffer clearing support for different x86 CPU vulns * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the * instruction is executed. */ -static __always_inline void mds_clear_cpu_buffers(void) +static __always_inline void x86_clear_cpu_buffers(void) { static const u16 ds = __KERNEL_DS; @@ -601,14 +607,15 @@ static __always_inline void mds_clear_cpu_buffers(void) } /** - * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability + * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS + * and TSA vulnerabilities. * * Clear CPU buffers if the corresponding static key is enabled */ -static __always_inline void mds_idle_clear_cpu_buffers(void) +static __always_inline void x86_idle_clear_cpu_buffers(void) { - if (static_branch_likely(&mds_idle_clear)) - mds_clear_cpu_buffers(); + if (static_branch_likely(&cpu_buf_idle_clear)) + x86_clear_cpu_buffers(); } #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5ddba366d3b4..97954c936c54 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -777,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + /* This bit combination is used to mark shadow stacks */ + WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == + _PAGE_DIRTY); pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); @@ -1073,22 +1076,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. 
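The sanity check that the removed mk_pte() macro used to carry moves into pfn_pte() above: Dirty=1 with Write=0 is the shadow-stack PTE encoding, so a caller-supplied pgprot must never look like that. A tiny standalone model of the predicate; the bit positions are chosen to mirror x86 but should be read as illustrative, not as the kernel's definitions:

#include <assert.h>
#include <stdint.h>

#define MODEL_PAGE_RW    (1u << 1)
#define MODEL_PAGE_DIRTY (1u << 6)

static int looks_like_shadow_stack(uint32_t prot)
{
	return (prot & (MODEL_PAGE_DIRTY | MODEL_PAGE_RW)) == MODEL_PAGE_DIRTY;
}

int main(void)
{
	assert(!looks_like_shadow_stack(MODEL_PAGE_DIRTY | MODEL_PAGE_RW)); /* ordinary RW page */
	assert(looks_like_shadow_stack(MODEL_PAGE_DIRTY));                  /* shadow-stack encoding */
	return 0;
}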
- * - * (Currently stuck as a macro because of indirect forward reference - * to linux/mm.h:page_to_nid()) - */ -#define mk_pte(page, pgprot) \ -({ \ - pgprot_t __pgprot = pgprot; \ - \ - WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \ - _PAGE_DIRTY); \ - pfn_pte(page_to_pfn(page), __pgprot); \ -}) - static inline int pmd_bad(pmd_t pmd) { return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != @@ -1353,8 +1340,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, @@ -1576,7 +1561,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte) return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b74ec5c3643b..a5731fb1e9dd 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -214,9 +214,6 @@ enum page_cache_mode { #define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) -#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) - /* * Page tables needs to have Write=1 in order for any lower PTEs to be * writable. This includes shadow stack memory (Write=0, Dirty=1) diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index de788b400fba..a5f761fbf45b 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -1,19 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H + +#include <asm/cmpxchg.h> +#include <asm/rwonce.h> #include <asm/irq_vectors.h> +#include <linux/bitmap.h> + #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 #define PID_TABLE_ENTRY_VALID 1 +#define NR_PIR_VECTORS 256 +#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) + /* Posted-Interrupt Descriptor */ struct pi_desc { - union { - u32 pir[8]; /* Posted interrupt requested */ - u64 pir64[4]; - }; + unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ union { struct { u16 notifications; /* Suppress and outstanding bits */ @@ -26,6 +31,65 @@ struct pi_desc { u32 rsvd[6]; } __aligned(64); +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During software processing of posted interrupts, the CPU needs to do + * natural width read and xchg for checking and clearing posted interrupt + * request (PIR), a 256 bit field within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. 
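The read-everything-then-xchg ordering described here, and implemented by pi_harvest_pir() a little further below, can be modeled in plain C: the cheap relaxed-load pass decides whether any of the expensive atomic exchanges (which would dirty the shared cache line) are needed at all. This is a userspace sketch under that assumption, not the kernel helper:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORDS 4	/* 256 vectors / 64 bits per word, as in the real descriptor */

static bool harvest_model(_Atomic unsigned long pir[WORDS],
			  unsigned long vals[WORDS])
{
	unsigned long pending = 0;
	int i;

	for (i = 0; i < WORDS; i++) {
		vals[i] = atomic_load_explicit(&pir[i], memory_order_relaxed);
		pending |= vals[i];
	}
	if (!pending)
		return false;	/* no xchg at all when nothing is pending */

	for (i = 0; i < WORDS; i++) {
		if (vals[i])
			vals[i] = atomic_exchange(&pir[i], 0);	/* only dirty non-zero words */
	}
	return true;
}

int main(void)
{
	_Atomic unsigned long pir[WORDS] = { 0, 1ul << 5, 0, 0 };
	unsigned long vals[WORDS];

	printf("pending: %d\n", harvest_model(pir, vals));
	return 0;
}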
The cache line states after each operation are as follows, + * assuming a 64-bit kernel: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * when processing posted interrupts in software, e.g. to dispatch interrupt + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR + * in KVM. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool pi_harvest_pir(unsigned long *pir, + unsigned long *pir_vals) +{ + unsigned long pending = 0; + int i; + + for (i = 0; i < NR_PIR_WORDS; i++) { + pir_vals[i] = READ_ONCE(pir[i]); + pending |= pir_vals[i]; + } + + if (!pending) + return false; + + for (i = 0; i < NR_PIR_WORDS; i++) { + if (!pir_vals[i]) + continue; + + pir_vals[i] = arch_xchg(&pir[i], 0); + } + + return true; +} + static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); @@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); + return test_and_set_bit(vector, pi_desc->pir); } static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) { - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); + return bitmap_empty(pi_desc->pir, NR_VECTORS); } static inline void pi_set_sn(struct pi_desc *pi_desc) @@ -81,6 +145,11 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } +static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc) +{ + return test_bit(vector, (unsigned long *)pi_desc->pir); +} + /* Non-atomic helpers */ static inline void __pi_set_sn(struct pi_desc *pi_desc) { @@ -105,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector) if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) return false; - return test_bit(vector, (unsigned long *)pid->pir); + return test_bit(vector, pid->pir); } extern void intel_posted_msi_init(void); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index f607081a022a..e406a1e92c63 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -78,7 +78,7 @@ extern unsigned char secondary_startup_64[]; extern unsigned char secondary_startup_64_no_verify[]; #endif -static inline size_t real_mode_size_needed(void) +static __always_inline size_t real_mode_size_needed(void) { if (real_mode_header) return 0; /* already allocated. 
*/ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index bd6afe805cf6..feb93b50e990 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -177,7 +177,7 @@ static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, return READ_ONCE(tsk->rmid) == rmid; } -static inline void resctrl_sched_in(struct task_struct *tsk) +static inline void resctrl_arch_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) __resctrl_sched_in(tsk); @@ -196,25 +196,22 @@ static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) /* x86 can always read an rmid, nothing needs allocating */ struct rdt_resource; -static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) { might_sleep(); return NULL; -}; +} -static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, - void *ctx) { }; +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, + void *ctx) { } -u64 resctrl_arch_get_prefetch_disable_bits(void); -int resctrl_arch_pseudo_lock_fn(void *_plr); -int resctrl_arch_measure_cycles_lat_fn(void *_plr); -int resctrl_arch_measure_l2_residency(void *_plr); -int resctrl_arch_measure_l3_residency(void *_plr); void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else -static inline void resctrl_sched_in(struct task_struct *tsk) {} +static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 8d9f1c9aaa4c..61f56cdaccb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -4,6 +4,7 @@ #include <asm/page.h> #include <asm-generic/set_memory.h> +#include <asm/pgtable.h> #define set_memory_rox set_memory_rox int set_memory_rox(unsigned long addr, int numpages); @@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages); * The caller is required to take care of these. 
*/ +int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot); int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); int _set_memory_wt(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 6324f4c6c545..692af46603a1 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -68,6 +68,8 @@ extern void x86_ce4100_early_setup(void); static inline void x86_ce4100_early_setup(void) { } #endif +#include <linux/kexec_handover.h> + #ifndef _SETUP #include <asm/espfix.h> diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 6158893786d6..89075ff19afa 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -223,6 +223,18 @@ struct snp_tsc_info_resp { u8 rsvd2[100]; } __packed; +/* + * Obtain the mean TSC frequency by decreasing the nominal TSC frequency with + * TSC_FACTOR as documented in the SNP Firmware ABI specification: + * + * GUEST_TSC_FREQ * (1 - (TSC_FACTOR * 0.00001)) + * + * which is equivalent to: + * + * GUEST_TSC_FREQ -= (GUEST_TSC_FREQ * TSC_FACTOR) / 100000; + */ +#define SNP_SCALE_TSC_FREQ(freq, factor) ((freq) - (freq) * (factor) / 100000) + struct snp_guest_req { void *req_buf; size_t req_sz; @@ -231,6 +243,7 @@ struct snp_guest_req { size_t resp_sz; u64 exit_code; + u64 exitinfo2; unsigned int vmpck_id; u8 msg_version; u8 msg_type; @@ -282,8 +295,11 @@ struct snp_secrets_page { u8 svsm_guest_vmpl; u8 rsvd3[3]; + /* The percentage decrease from nominal to mean TSC frequency. */ + u32 tsc_factor; + /* Remainder of page */ - u8 rsvd4[3744]; + u8 rsvd4[3740]; } __packed; struct snp_msg_desc { @@ -415,6 +431,10 @@ struct svsm_call { #define SVSM_ATTEST_SERVICES 0 #define SVSM_ATTEST_SINGLE_SERVICE 1 +#define SVSM_VTPM_CALL(x) ((2ULL << 32) | (x)) +#define SVSM_VTPM_QUERY 0 +#define SVSM_VTPM_CMD 1 + #ifdef CONFIG_AMD_MEM_ENCRYPT extern u8 snp_vmpl; @@ -441,7 +461,7 @@ static __always_inline void sev_es_nmi_complete(void) cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_nmi_complete(); } -extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd); extern void sev_enable(struct boot_params *bp); /* @@ -482,8 +502,6 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } -struct snp_guest_request_ioctl; - void setup_ghcb(void); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); @@ -509,8 +527,9 @@ void snp_kexec_begin(void); int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id); struct snp_msg_desc *snp_msg_alloc(void); void snp_msg_free(struct snp_msg_desc *mdesc); -int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio); +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req); + +int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); @@ -550,7 +569,7 @@ static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } -static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline int sev_es_efi_map_ghcbs_cas(pgd_t *pgd) { return 0; } static inline void sev_enable(struct boot_params *bp) { } static 
inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } @@ -581,8 +600,9 @@ static inline void snp_kexec_begin(void) { } static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; } static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } static inline void snp_msg_free(struct snp_msg_desc *mdesc) { } -static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) { return -ENODEV; } +static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, + struct snp_guest_req *req) { return -ENODEV; } +static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index a28ff6b14145..8bc074c8d7c6 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -13,6 +13,7 @@ /* TDX module Call Leaf IDs */ #define TDG_VP_VMCALL 0 #define TDG_VP_INFO 1 +#define TDG_MR_RTMR_EXTEND 2 #define TDG_VP_VEINFO_GET 3 #define TDG_MR_REPORT 4 #define TDG_MEM_PAGE_ACCEPT 6 @@ -67,11 +68,20 @@ #define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT) /* TDX hypercall Leaf IDs */ +#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000 #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_GET_QUOTE 0x10002 #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 +#define TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT 0x10004ULL -#define TDVMCALL_STATUS_RETRY 1 +/* + * TDG.VP.VMCALL Status Codes (returned in R10) + */ +#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL +#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL +#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL +#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL +#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL /* * Bitmasks of exposed registers (with VMM). diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index e770c4fc47f4..8727c7e21dd1 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +/* + * To prevent immediate repeat of single step trap on return from SIGTRAP + * handler if the trap flag (TF) is set without an external debugger attached, + * clear the software event flag in the augmented SS, ensuring no single-step + * trap is pending upon ERETU completion. + * + * Note, this function should be called in sigreturn() before the original + * state is restored to make sure the TF is read from the entry frame. + */ +static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs) +{ + /* + * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction + * is being single-stepped, do not clear the software event flag in the + * augmented SS, thus a debugger won't skip over the following instruction. 
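A stand-alone model of the decision described above, showing the only two outcomes: the software-event bit survives when TF is set (a debugger is single-stepping the sigreturn itself), and is cleared otherwise (TF was only set by the signal setup, so no single-step trap should be pending after ERETU). The structure and flag layout below are invented for illustration; only the TF bit position mirrors real EFLAGS:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MODEL_EFLAGS_TF (1u << 8)

struct model_frame {
	uint32_t flags;
	bool fred_swevent;
};

static void prevent_single_step_model(struct model_frame *f)
{
	if (!(f->flags & MODEL_EFLAGS_TF))
		f->fred_swevent = false;
}

int main(void)
{
	struct model_frame stepped = { .flags = MODEL_EFLAGS_TF, .fred_swevent = true };
	struct model_frame normal  = { .flags = 0,               .fred_swevent = true };

	prevent_single_step_model(&stepped);
	prevent_single_step_model(&normal);

	assert(stepped.fred_swevent);	/* debugger keeps its pending event */
	assert(!normal.fred_swevent);	/* no spurious #DB after ERETU */
	return 0;
}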
+ */ +#ifdef CONFIG_X86_FRED + if (!(regs->flags & X86_EFLAGS_TF)) + regs->fred_ss.swevent = 0; +#endif +} + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 0c1c68039d6f..22bfebe6776d 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -112,7 +112,10 @@ void __noreturn hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); -int wbinvd_on_all_cpus(void); +void wbinvd_on_all_cpus(void); +void wbinvd_on_cpus_mask(struct cpumask *cpus); +void wbnoinvd_on_all_cpus(void); +void wbnoinvd_on_cpus_mask(struct cpumask *cpus); void smp_kick_mwait_play_dead(void); void __noreturn mwait_play_dead(unsigned int eax_hint); @@ -148,10 +151,24 @@ static inline struct cpumask *cpu_l2c_shared_mask(int cpu) #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() -static inline int wbinvd_on_all_cpus(void) +static inline void wbinvd_on_all_cpus(void) { wbinvd(); - return 0; +} + +static inline void wbinvd_on_cpus_mask(struct cpumask *cpus) +{ + wbinvd(); +} + +static inline void wbnoinvd_on_all_cpus(void) +{ + wbnoinvd(); +} + +static inline void wbnoinvd_on_cpus_mask(struct cpumask *cpus) +{ + wbnoinvd(); } static inline struct cpumask *cpu_llc_shared_mask(int cpu) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index ecda17efa042..fde2bd7af19e 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -104,9 +104,36 @@ static inline void wrpkru(u32 pkru) } #endif +/* + * Write back all modified lines in all levels of cache associated with this + * logical processor to main memory, and then invalidate all caches. Depending + * on the micro-architecture, WBINVD (and WBNOINVD below) may or may not affect + * lower level caches associated with another logical processor that shares any + * level of this processor's cache hierarchy. + */ static __always_inline void wbinvd(void) { - asm volatile("wbinvd": : :"memory"); + asm volatile("wbinvd" : : : "memory"); +} + +/* Instruction encoding provided for binutils backwards compatibility. */ +#define ASM_WBNOINVD _ASM_BYTES(0xf3,0x0f,0x09) + +/* + * Write back all modified lines in all levels of cache associated with this + * logical processor to main memory, but do NOT explicitly invalidate caches, + * i.e. leave all/most cache lines in the hierarchy in non-modified state. + */ +static __always_inline void wbnoinvd(void) +{ + /* + * Explicitly encode WBINVD if X86_FEATURE_WBNOINVD is unavailable even + * though WBNOINVD is backwards compatible (it's simply WBINVD with an + * ignored REP prefix), to guarantee that WBNOINVD isn't used if it + * needs to be avoided for any reason. For all supported usage in the + * kernel, WBINVD is functionally a superset of WBNOINVD. 
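For the curious, the "ignored REP prefix" remark above is visible directly in the byte encodings the patch itself defines: ASM_WBNOINVD is F3 0F 09, i.e. plain WBINVD (0F 09) behind an F3 prefix, which is why falling back to WBINVD in the alternative() is always safe, just more expensive. A trivial check of that relationship, purely illustrative:

#include <assert.h>
#include <string.h>

int main(void)
{
	const unsigned char wbinvd[]   = { 0x0f, 0x09 };
	const unsigned char wbnoinvd[] = { 0xf3, 0x0f, 0x09 };

	/* Stripping the REP prefix yields plain WBINVD. */
	assert(wbnoinvd[0] == 0xf3);
	assert(memcmp(&wbnoinvd[1], wbinvd, sizeof(wbinvd)) == 0);
	return 0;
}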
+ */ + alternative("wbinvd", ASM_WBNOINVD, X86_FEATURE_WBNOINVD); } static inline unsigned long __read_cr4(void) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 9b7fa99ae951..ffc27f676243 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -116,6 +116,7 @@ enum { INTERCEPT_INVPCID, INTERCEPT_MCOMMIT, INTERCEPT_TLBSYNC, + INTERCEPT_BUSLOCK, INTERCEPT_IDLE_HLT = 166, }; @@ -159,7 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ - u8 reserved_8[720]; + u8 reserved_8[16]; + u16 bus_lock_counter; /* Offset 0x120 */ + u8 reserved_9[22]; + u64 allowed_sev_features; /* Offset 0x138 */ + u64 guest_sev_features; /* Offset 0x140 */ + u8 reserved_10[664]; /* * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. @@ -246,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) +/* + * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible + * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate + * GA log interrupts to wake the vCPU (because it's blocking or about to block). + */ +#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61) + #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12) #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) #define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL) #define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF @@ -284,13 +295,13 @@ enum avic_ipi_failure_cause { static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7c488ff0c764..c10dbb74cd00 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->orig_ax; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->orig_ax = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task, args[5] = regs->bp; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->bx = args[0]; + regs->cx = args[1]; + regs->dx = args[2]; + regs->si = args[3]; + regs->di = args[4]; + regs->bp = args[5]; +} + static inline int syscall_get_arch(struct task_struct *task) { return 
AUDIT_ARCH_I386; @@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else +# endif + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } +} + static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 4a1922ec80cf..7ddef3a69866 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/bits.h> +#include <linux/mmzone.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -18,6 +19,7 @@ * TDX module. */ #define TDX_ERROR _BITUL(63) +#define TDX_NON_RECOVERABLE _BITUL(62) #define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) @@ -33,6 +35,8 @@ #ifndef __ASSEMBLER__ #include <uapi/asm/mce.h> +#include <asm/tdx_global_metadata.h> +#include <linux/pgtable.h> /* * Used by the #VE exception handler to gather the #VE exception @@ -64,6 +68,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs); int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); +int tdx_mcall_extend_rtmr(u8 index, u8 *data); + u64 tdx_hcall_get_quote(u8 *buf, size_t size); void __init tdx_dump_attributes(u64 td_attr); @@ -100,7 +106,7 @@ void tdx_init(void); typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); -static inline u64 sc_retry(sc_func_t func, u64 fn, +static __always_inline u64 sc_retry(sc_func_t func, u64 fn, struct tdx_module_args *args) { int retry = RDRAND_RETRY_LOOPS; @@ -119,11 +125,82 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); + +int tdx_guest_keyid_alloc(void); +u32 tdx_get_nr_guest_keyids(void); +void tdx_guest_keyid_free(unsigned int keyid); + +struct tdx_td { + /* TD root structure: */ + struct page *tdr_page; + + int tdcs_nr_pages; + /* TD control structure: */ + struct page **tdcs_pages; + + /* Size of `tdcx_pages` in struct tdx_vp */ + int tdcx_nr_pages; +}; + +struct tdx_vp { + /* TDVP root page */ + struct page *tdvpr_page; + + /* TD vCPU control structure: */ + struct page **tdcx_pages; +}; + +static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) +{ + u64 ret; + + ret = page_to_phys(page); + /* KeyID bits are just above the physical address bits: */ + ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; + + return ret; +} + +static inline int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; +} + +u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); +u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); +u64 tdh_mem_page_aug(struct tdx_td *td, 
u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mng_key_config(struct tdx_td *td); +u64 tdh_mng_create(struct tdx_td *td, u16 hkid); +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mr_finalize(struct tdx_td *td); +u64 tdh_vp_flush(struct tdx_vp *vp); +u64 tdh_mng_vpflushdone(struct tdx_td *td); +u64 tdh_mng_key_freeid(struct tdx_td *td); +u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); +u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); +u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); +u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); +u64 tdh_mem_track(struct tdx_td *tdr); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_phymem_cache_wb(bool resume); +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } +static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h index 6dd3c9695f59..060a2ad744bf 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -17,9 +17,28 @@ struct tdx_sys_info_tdmr { u16 pamt_1g_entry_size; }; +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + struct tdx_sys_info { struct tdx_sys_info_features features; struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; }; #endif diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 0454d5e60e5d..721b408d9a67 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -44,16 +44,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - -DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -64,11 +54,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_init_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 8707361b24da..cca7d6641287 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -256,6 +256,7 @@ enum 
vmcs_field { TSC_MULTIPLIER_HIGH = 0x00002033, TERTIARY_VM_EXEC_CONTROL = 0x00002034, TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + SHARED_EPT_POINTER = 0x0000203C, PID_POINTER_TABLE = 0x00002042, PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, @@ -586,6 +587,7 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_PROT_READ BIT(3) #define EPT_VIOLATION_PROT_WRITE BIT(4) #define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6) #define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \ EPT_VIOLATION_PROT_WRITE | \ EPT_VIOLATION_PROT_EXEC) diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index 0007ba077c0c..41da492dfb01 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -15,7 +15,26 @@ which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ -/* Define reserved bits in DR6 which are always set to 1 */ +/* + * Define bits in DR6 which are set to 1 by default. + * + * This is also the DR6 architectural value following Power-up, Reset or INIT. + * + * Note, with the introduction of Bus Lock Detection (BLD) and Restricted + * Transactional Memory (RTM), the DR6 register has been modified: + * + * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports + * Bus Lock Detection. The assertion of a bus lock could clear it. + * + * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports + * restricted transactional memory. #DB occurred inside an RTM region + * could clear it. + * + * Apparently, DR6.BLD and DR6.RTM are active low bits. + * + * As a result, DR6_RESERVED is an incorrect name now, but it is kept for + * compatibility. + */ #define DR6_RESERVED (0xFFFF0FF0) #define DR_TRAP0 (0x1) /* db0 */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 460306b35a4b..0f15d683817d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -441,6 +441,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) +#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -844,6 +845,7 @@ struct kvm_sev_snp_launch_start { }; /* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_PAGE_TYPE_INVALID 0x0 #define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 #define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 #define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 @@ -930,4 +932,80 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SNP_VM 4 #define KVM_X86_TDX_VM 5 +/* Trust Domain eXtension sub-ioctl() commands. */ +enum kvm_tdx_cmd_id { + KVM_TDX_CAPABILITIES = 0, + KVM_TDX_INIT_VM, + KVM_TDX_INIT_VCPU, + KVM_TDX_INIT_MEM_REGION, + KVM_TDX_FINALIZE_VM, + KVM_TDX_GET_CPUID, + + KVM_TDX_CMD_NR_MAX, +}; + +struct kvm_tdx_cmd { + /* enum kvm_tdx_cmd_id */ + __u32 id; + /* flags for sub-commend. If sub-command doesn't use this, set zero. */ + __u32 flags; + /* + * data for each sub-command. An immediate or a pointer to the actual + * data in process virtual address. If sub-command doesn't use it, + * set zero. + */ + __u64 data; + /* + * Auxiliary error code. The sub-command may return TDX SEAMCALL + * status code in addition to -Exxx. 
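A hedged user-space sketch of driving this command envelope: it assumes the KVM_TDX_* commands are reached through KVM_MEMORY_ENCRYPT_OP on a VM created with type KVM_X86_TDX_VM, and the query_tdx_caps() helper plus the entry count of 64 are inventions of the illustration, not values from this header.

/* Illustration only; error handling trimmed, dispatch ioctl assumed. */
#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static struct kvm_tdx_capabilities *query_tdx_caps(int tdx_vm_fd)
{
	unsigned int nent = 64;	/* assumed room for returned CPUID entries */
	struct kvm_tdx_capabilities *caps;
	struct kvm_tdx_cmd cmd;

	caps = calloc(1, sizeof(*caps) + nent * sizeof(struct kvm_cpuid_entry2));
	if (!caps)
		return NULL;
	caps->cpuid.nent = nent;

	memset(&cmd, 0, sizeof(cmd));
	cmd.id = KVM_TDX_CAPABILITIES;
	cmd.data = (__u64)(unsigned long)caps;	/* pointer passed by value */

	if (ioctl(tdx_vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0) {
		/* On failure, hw_error may carry a TDX SEAMCALL status. */
		free(caps);
		return NULL;
	}
	return caps;
}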
+ */ + __u64 hw_error; +}; + +struct kvm_tdx_capabilities { + __u64 supported_attrs; + __u64 supported_xfam; + + __u64 kernel_tdvmcallinfo_1_r11; + __u64 user_tdvmcallinfo_1_r11; + __u64 kernel_tdvmcallinfo_1_r12; + __u64 user_tdvmcallinfo_1_r12; + + __u64 reserved[250]; + + /* Configurable CPUID bits for userspace */ + struct kvm_cpuid2 cpuid; +}; + +struct kvm_tdx_init_vm { + __u64 attributes; + __u64 xfam; + __u64 mrconfigid[6]; /* sha384 digest */ + __u64 mrowner[6]; /* sha384 digest */ + __u64 mrownerconfig[6]; /* sha384 digest */ + + /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */ + __u64 reserved[12]; + + /* + * Call KVM_TDX_INIT_VM before vcpu creation, thus before + * KVM_SET_CPUID2. + * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the + * TDX module directly virtualizes those CPUIDs without VMM. The user + * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with + * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of + * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX + * module doesn't virtualize. + */ + struct kvm_cpuid2 cpuid; +}; + +#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0) + +struct kvm_tdx_init_mem_region { + __u64 source_addr; + __u64 gpa; + __u64 nr_pages; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h index 50c45ead4e7c..2671c4e1b3a0 100644 --- a/arch/x86/include/uapi/asm/setup_data.h +++ b/arch/x86/include/uapi/asm/setup_data.h @@ -13,7 +13,8 @@ #define SETUP_CC_BLOB 7 #define SETUP_IMA 8 #define SETUP_RNG_SEED 9 -#define SETUP_ENUM_MAX SETUP_RNG_SEED +#define SETUP_KEXEC_KHO 10 +#define SETUP_ENUM_MAX SETUP_KEXEC_KHO #define SETUP_INDIRECT (1<<31) #define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) @@ -78,6 +79,16 @@ struct ima_setup_data { __u64 size; } __attribute__((packed)); +/* + * Locations of kexec handover metadata + */ +struct kho_data { + __u64 fdt_addr; + __u64 fdt_size; + __u64 scratch_addr; + __u64 scratch_size; +} __attribute__((packed)); + #endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index ec1321248dac..9c640a521a67 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -95,6 +95,7 @@ #define SVM_EXIT_CR14_WRITE_TRAP 0x09e #define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 +#define SVM_EXIT_BUS_LOCK 0x0a5 #define SVM_EXIT_IDLE_HLT 0x0a6 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 @@ -225,6 +226,7 @@ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ + { SVM_EXIT_BUS_LOCK, "buslock" }, \ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index a5faf6d88f1b..f0f4a4cf84a7 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -34,6 +34,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_SIPI_SIGNAL 4 +#define EXIT_REASON_OTHER_SMI 6 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -92,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_TDCALL 77 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" 
}, \ @@ -155,7 +157,8 @@ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ - { EXIT_REASON_NOTIFY, "NOTIFY" } + { EXIT_REASON_NOTIFY, "NOTIFY" }, \ + { EXIT_REASON_TDCALL, "TDCALL" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 99a783fd4691..0d2a6d953be9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c index f36f28405dcc..6d7603511f52 100644 --- a/arch/x86/kernel/acpi/madt_wakeup.c +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -126,7 +126,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) return 0; } -static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) +static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu) { if (!acpi_mp_wake_mailbox_paddr) { pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ecfe7b497cad..ea1d984166cd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,24 @@ static struct module *its_mod; #endif static void *its_page; static unsigned int its_offset; +struct its_array its_pages; + +static void *__its_alloc(struct its_array *pages) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + if (!page) + return NULL; + + void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + pages->pages = tmp; + pages->pages[pages->num++] = page; + + return no_free_ptr(page); +} /* Initialize a thunk with the "jmp *reg; int3" instructions. 
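The thunk body named in this comment is an indirect jump through the chosen register followed by an int3 trap. A standalone sketch of the byte sequence for two registers; emit_jmp_reg_int3() is an invention of the illustration, not the kernel's emission code.

#include <stddef.h>
#include <stdint.h>

/* Emit "jmp *%rax; int3" or "jmp *%r11; int3" into buf, return length. */
static size_t emit_jmp_reg_int3(uint8_t *buf, int use_r11)
{
	size_t i = 0;

	if (use_r11)
		buf[i++] = 0x41;		/* REX.B selects r8..r15 */
	buf[i++] = 0xff;			/* opcode: indirect JMP, group 5 /4 */
	buf[i++] = use_r11 ? 0xe3 : 0xe0;	/* ModRM for %r11 / %rax */
	buf[i++] = 0xcc;			/* int3 traps a fall-through */

	return i;
}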
*/ static void *its_init_thunk(void *thunk, int reg) @@ -151,6 +169,21 @@ static void *its_init_thunk(void *thunk, int reg) return thunk + offset; } +static void its_pages_protect(struct its_array *pages) +{ + for (int i = 0; i < pages->num; i++) { + void *page = pages->pages[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +static void its_fini_core(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + its_pages_protect(&its_pages); + kfree(its_pages.pages); +} + #ifdef CONFIG_MODULES void its_init_mod(struct module *mod) { @@ -173,10 +206,8 @@ void its_fini_mod(struct module *mod) its_page = NULL; mutex_unlock(&text_mutex); - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; - execmem_restore_rox(page, PAGE_SIZE); - } + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + its_pages_protect(&mod->arch.its_pages); } void its_free_mod(struct module *mod) @@ -184,37 +215,33 @@ void its_free_mod(struct module *mod) if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_free(page); } - kfree(mod->its_page_array); + kfree(mod->arch.its_pages.pages); } #endif /* CONFIG_MODULES */ static void *its_alloc(void) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); - - if (!page) - return NULL; + struct its_array *pages = &its_pages; + void *page; #ifdef CONFIG_MODULES - if (its_mod) { - void *tmp = krealloc(its_mod->its_page_array, - (its_mod->its_num_pages+1) * sizeof(void *), - GFP_KERNEL); - if (!tmp) - return NULL; + if (its_mod) + pages = &its_mod->arch.its_pages; +#endif - its_mod->its_page_array = tmp; - its_mod->its_page_array[its_mod->its_num_pages++] = page; + page = __its_alloc(pages); + if (!page) + return NULL; - execmem_make_temp_rw(page, PAGE_SIZE); - } -#endif /* CONFIG_MODULES */ + execmem_make_temp_rw(page, PAGE_SIZE); + if (pages == &its_pages) + set_memory_x((unsigned long)page, 1); - return no_free_ptr(page); + return page; } static void *its_allocate_thunk(int reg) @@ -268,7 +295,9 @@ u8 *its_static_thunk(int reg) return thunk; } -#endif +#else +static inline void its_fini_core(void) {} +#endif /* CONFIG_MITIGATION_ITS */ /* * Nomenclature for variable names to simplify and clarify this code and ease @@ -2338,6 +2367,8 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); + its_fini_core(); + /* * Adjust all CALL instructions to point to func()-10, including * those in .altinstr_replacement. 
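__its_alloc() above relies on the kernel's scope-based cleanup helpers: __free(execmem) arms an automatic execmem_free() for the early-return paths and no_free_ptr() disarms it when ownership is handed out. Below is a standalone analogue built on the compiler cleanup attribute those helpers wrap, with malloc()/realloc() standing in for execmem_alloc()/krealloc(); alloc_and_track() and autofree() are names of this sketch only.

#include <stdlib.h>

static void autofree(void *p)
{
	free(*(void **)p);	/* runs when the annotated variable leaves scope */
}

static void *alloc_and_track(void ***table, size_t *num)
{
	void *buf __attribute__((cleanup(autofree))) = malloc(64);
	void **tmp;
	void *ret;

	if (!buf)
		return NULL;

	tmp = realloc(*table, (*num + 1) * sizeof(void *));
	if (!tmp)
		return NULL;		/* buf is freed automatically here */

	*table = tmp;
	tmp[(*num)++] = buf;

	/* Equivalent of no_free_ptr(): take the pointer, disarm the cleanup. */
	ret = buf;
	buf = NULL;
	return ret;
}

The kernel helpers add type checking on top, but the control flow is the same: every return before the hand-off frees the allocation without an explicit error path.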
@@ -3107,6 +3138,6 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); } diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b5bb7a2e8340..58abb941c45b 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -27,7 +27,13 @@ static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } -static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } + +static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, + unsigned int cpu) +{ + return -1; +} + static u64 noop_apic_icr_read(void) { return 0; } static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index e272bc7fdc8e..5c5be2d58242 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -57,7 +57,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val) numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); } -static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu) { numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ba5a4ccda37a..5ba2feb2c04c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2225,7 +2225,7 @@ static int mp_irqdomain_create(int ioapic) /* Handle device tree enumerated APICs proper */ if (cfg->dev) { - fn = of_node_to_fwnode(cfg->dev); + fn = of_fwnode_handle(cfg->dev); } else { fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fee42a73d64a..a947b46a8b64 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -183,6 +183,7 @@ setnew: apicd->cpu = newcpu; BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); per_cpu(vector_irq, newcpu)[newvec] = desc; + apic_update_irq_cfg(irqd, newvec, newcpu); } static void vector_assign_managed_shutdown(struct irq_data *irqd) @@ -261,7 +262,6 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) if (vector < 0) return vector; apic_update_vector(irqd, vector, cpu); - apic_update_irq_cfg(irqd, vector, cpu); return 0; } @@ -338,7 +338,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) if (vector < 0) return vector; apic_update_vector(irqd, vector, cpu); - apic_update_irq_cfg(irqd, vector, cpu); + return 0; } @@ -864,7 +864,7 @@ void lapic_offline(void) __vector_cleanup(cl, false); irq_matrix_offline(vector_matrix); - WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0); WARN_ON_ONCE(!hlist_empty(&cl->head)); unlock_vector_lock(); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7fef504ca508..15209f220e1f 100644 --- 
a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -667,7 +667,7 @@ static __init void build_uv_gr_table(void) } } -static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) +static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu) { unsigned long val; int pnode; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 93da466dfe2c..a5ece6ebe8a7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,7 +9,7 @@ #include <linux/sched/clock.h> #include <linux/random.h> #include <linux/topology.h> -#include <asm/amd/fch.h> +#include <linux/platform_data/x86/amd-fch.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -31,7 +31,7 @@ #include "cpu.h" -u16 invlpgb_count_max __ro_after_init; +u16 invlpgb_count_max __ro_after_init = 1; static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { @@ -377,6 +377,47 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c) #endif } +#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \ + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \ + step, step, ucode) + +static const struct x86_cpu_id amd_tsa_microcode[] = { + ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7), + ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b), + ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d), + ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c), + ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c), + ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109), + ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e), + ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211), + ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108), + ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012), + ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a), + ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108), + ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208), + ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008), + ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008), + ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216), + {}, +}; + +static void tsa_init(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (cpu_has(c, X86_FEATURE_ZEN3) || + cpu_has(c, X86_FEATURE_ZEN4)) { + if (x86_match_min_microcode_rev(amd_tsa_microcode)) + setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR); + else + pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode); + } else { + setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO); + setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO); + } +} + static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -489,6 +530,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } bsp_determine_snp(c); + tsa_init(c); + + if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID)) + setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT); + return; warn: @@ -930,6 +976,16 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) init_spectral_chicken(c); fix_erratum_1386(c); zen2_zenbleed_check(c); + + /* Disable RDSEED on AMD Cyan Skillfish because of an error. */ + if (c->x86_model == 0x47 && c->x86_stepping == 0x0) { + clear_cpu_cap(c, X86_FEATURE_RDSEED); + msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18); + pr_emerg("RDSEED is not reliable on this platform; disabling.\n"); + } + + /* Correct misconfigured CPUID on some clients. 
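Each amd_tsa_microcode[] row above ties a minimum microcode revision to one family/model/stepping. Restated for the first row as a worked check; verw_clears_buffers() is an illustrative name, and the kernel performs the comparison through x86_match_min_microcode_rev() rather than open-coding it.

#include <stdbool.h>
#include <stdint.h>

/* First table row: family 0x19, model 0x01, stepping 1 (Zen3). */
static bool verw_clears_buffers(uint8_t family, uint8_t model,
				uint8_t stepping, uint32_t ucode_rev)
{
	return family == 0x19 && model == 0x01 && stepping == 0x1 &&
	       ucode_rev >= 0x0a0011d7;
}

Parts that are neither Zen3 nor Zen4 skip the table entirely and have TSA_SQ_NO and TSA_L1_NO forced instead, as tsa_init() shows.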
*/ + clear_cpu_cap(c, X86_FEATURE_INVLPGB); } static void init_amd_zen3(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7f94e6a5497d..b74bf937cd9f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -94,6 +94,8 @@ static void __init bhi_apply_mitigation(void); static void __init its_select_mitigation(void); static void __init its_update_mitigation(void); static void __init its_apply_mitigation(void); +static void __init tsa_select_mitigation(void); +static void __init tsa_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -113,10 +115,9 @@ void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; static void __init set_return_thunk(void *thunk) { - if (x86_return_thunk != __x86_return_thunk) - pr_warn("x86/bugs: return thunk changed\n"); - x86_return_thunk = thunk; + + pr_info("active return thunk: %ps\n", thunk); } /* Update SPEC_CTRL MSR and its cached copy unconditionally */ @@ -169,9 +170,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); EXPORT_SYMBOL_GPL(switch_vcpu_ibpb); -/* Control MDS CPU buffer clear before idling (halt, mwait) */ -DEFINE_STATIC_KEY_FALSE(mds_idle_clear); -EXPORT_SYMBOL_GPL(mds_idle_clear); +/* Control CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear); +EXPORT_SYMBOL_GPL(cpu_buf_idle_clear); /* * Controls whether l1d flush based mitigations are enabled, @@ -188,6 +189,39 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); +#undef pr_fmt +#define pr_fmt(fmt) "mitigations: " fmt + +static void __init cpu_print_attack_vectors(void) +{ + pr_info("Enabled attack vectors: "); + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) + pr_cont("user_kernel, "); + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) + pr_cont("user_user, "); + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) + pr_cont("guest_host, "); + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) + pr_cont("guest_guest, "); + + pr_cont("SMT mitigations: "); + + switch (smt_mitigations) { + case SMT_MITIGATIONS_OFF: + pr_cont("off\n"); + break; + case SMT_MITIGATIONS_AUTO: + pr_cont("auto\n"); + break; + case SMT_MITIGATIONS_ON: + pr_cont("on\n"); + } +} + void __init cpu_select_mitigations(void) { /* @@ -208,6 +242,8 @@ void __init cpu_select_mitigations(void) x86_arch_cap_msr = x86_read_arch_cap_msr(); + cpu_print_attack_vectors(); + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -225,6 +261,7 @@ void __init cpu_select_mitigations(void) gds_select_mitigation(); its_select_mitigation(); bhi_select_mitigation(); + tsa_select_mitigation(); /* * After mitigations are selected, some may need to update their @@ -272,6 +309,7 @@ void __init cpu_select_mitigations(void) gds_apply_mitigation(); its_apply_mitigation(); bhi_apply_mitigation(); + tsa_apply_mitigation(); } /* @@ -329,6 +367,62 @@ static void x86_amd_ssb_disable(void) #undef pr_fmt #define pr_fmt(fmt) "MDS: " fmt +/* + * Returns true if vulnerability should be mitigated based on the + * selected attack vector controls. 
+ * + * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst + */ +static bool __init should_mitigate_vuln(unsigned int bug) +{ + switch (bug) { + /* + * The only runtime-selected spectre_v1 mitigations in the kernel are + * related to SWAPGS protection on kernel entry. Therefore, protection + * is only required for the user->kernel attack vector. + */ + case X86_BUG_SPECTRE_V1: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL); + + case X86_BUG_SPECTRE_V2: + case X86_BUG_RETBLEED: + case X86_BUG_SRSO: + case X86_BUG_L1TF: + case X86_BUG_ITS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + + case X86_BUG_SPECTRE_V2_USER: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST); + + /* + * All the vulnerabilities below allow potentially leaking data + * across address spaces. Therefore, mitigation is required for + * any of these 4 attack vectors. + */ + case X86_BUG_MDS: + case X86_BUG_TAA: + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_RFDS: + case X86_BUG_SRBDS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST); + + case X86_BUG_GDS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) || + (smt_mitigations != SMT_MITIGATIONS_OFF); + default: + WARN(1, "Unknown bug %x\n", bug); + return false; + } +} + /* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF; @@ -382,13 +476,17 @@ static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; static void __init mds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MDS)) { mds_mitigation = MDS_MITIGATION_OFF; return; } - if (mds_mitigation == MDS_MITIGATION_AUTO) - mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_MDS)) + mds_mitigation = MDS_MITIGATION_FULL; + else + mds_mitigation = MDS_MITIGATION_OFF; + } if (mds_mitigation == MDS_MITIGATION_OFF) return; @@ -398,7 +496,7 @@ static void __init mds_select_mitigation(void) static void __init mds_update_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_MDS)) return; /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */ @@ -419,7 +517,7 @@ static void __init mds_apply_mitigation(void) mds_mitigation == MDS_MITIGATION_VMWERV) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && - (mds_nosmt || cpu_mitigations_auto_nosmt())) + (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) cpu_smt_disable(false); } } @@ -475,12 +573,13 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) - taa_mitigation = TAA_MITIGATION_OFF; - /* Microcode will be checked in taa_update_mitigation(). 
*/ - if (taa_mitigation == TAA_MITIGATION_AUTO) - taa_mitigation = TAA_MITIGATION_VERW; + if (taa_mitigation == TAA_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_TAA)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_OFF; + } if (taa_mitigation != TAA_MITIGATION_OFF) verw_clear_cpu_buf_mitigation_selected = true; @@ -488,7 +587,7 @@ static void __init taa_select_mitigation(void) static void __init taa_update_mitigation(void) { - if (!taa_vulnerable() || cpu_mitigations_off()) + if (!taa_vulnerable()) return; if (verw_clear_cpu_buf_mitigation_selected) @@ -529,7 +628,7 @@ static void __init taa_apply_mitigation(void) */ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - if (taa_nosmt || cpu_mitigations_auto_nosmt()) + if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) cpu_smt_disable(false); } } @@ -575,8 +674,12 @@ static void __init mmio_select_mitigation(void) } /* Microcode will be checked in mmio_update_mitigation(). */ - if (mmio_mitigation == MMIO_MITIGATION_AUTO) - mmio_mitigation = MMIO_MITIGATION_VERW; + if (mmio_mitigation == MMIO_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA)) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_OFF; + } if (mmio_mitigation == MMIO_MITIGATION_OFF) return; @@ -591,7 +694,7 @@ static void __init mmio_select_mitigation(void) static void __init mmio_update_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) return; if (verw_clear_cpu_buf_mitigation_selected) @@ -637,9 +740,9 @@ static void __init mmio_apply_mitigation(void) * is required irrespective of SMT state. */ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) - static_branch_enable(&mds_idle_clear); + static_branch_enable(&cpu_buf_idle_clear); - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) cpu_smt_disable(false); } @@ -680,13 +783,17 @@ static inline bool __init verw_clears_cpu_reg_file(void) static void __init rfds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_RFDS)) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } - if (rfds_mitigation == RFDS_MITIGATION_AUTO) - rfds_mitigation = RFDS_MITIGATION_VERW; + if (rfds_mitigation == RFDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_RFDS)) + rfds_mitigation = RFDS_MITIGATION_VERW; + else + rfds_mitigation = RFDS_MITIGATION_OFF; + } if (rfds_mitigation == RFDS_MITIGATION_OFF) return; @@ -697,7 +804,7 @@ static void __init rfds_select_mitigation(void) static void __init rfds_update_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_RFDS)) return; if (verw_clear_cpu_buf_mitigation_selected) @@ -798,13 +905,19 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) { srbds_mitigation = SRBDS_MITIGATION_OFF; return; } - if (srbds_mitigation == SRBDS_MITIGATION_AUTO) - srbds_mitigation = SRBDS_MITIGATION_FULL; + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_SRBDS)) + srbds_mitigation = SRBDS_MITIGATION_FULL; + else { + srbds_mitigation = SRBDS_MITIGATION_OFF; + return; + } + } /* * Check to see if this is one of the MDS_NO systems supporting TSX that @@ -952,12 +1065,15 @@ static void __init 
gds_select_mitigation(void) return; } - if (cpu_mitigations_off()) - gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ - - if (gds_mitigation == GDS_MITIGATION_AUTO) - gds_mitigation = GDS_MITIGATION_FULL; + if (gds_mitigation == GDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_GDS)) + gds_mitigation = GDS_MITIGATION_FULL; + else { + gds_mitigation = GDS_MITIGATION_OFF; + return; + } + } /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { @@ -1063,13 +1179,16 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + + if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1)) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; } static void __init spectre_v1_apply_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) return; if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { @@ -1120,6 +1239,20 @@ early_param("nospectre_v1", nospectre_v1_cmdline); enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +/* Depends on spectre_v2 mitigation selected already */ +static inline bool cdt_possible(enum spectre_v2_mitigation mode) +{ + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) || + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) + return false; + + if (mode == SPECTRE_V2_RETPOLINE || + mode == SPECTRE_V2_EIBRS_RETPOLINE) + return true; + + return false; +} + #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt @@ -1158,6 +1291,21 @@ static enum retbleed_mitigation retbleed_mitigation __ro_after_init = static int __ro_after_init retbleed_nosmt = false; +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, + SRSO_MITIGATION_UCODE_NEEDED, + SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_NOSMT, + SRSO_MITIGATION_SAFE_RET, + SRSO_MITIGATION_IBPB, + SRSO_MITIGATION_IBPB_ON_VMEXIT, + SRSO_MITIGATION_BP_SPEC_REDUCE, +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; + static int __init retbleed_parse_cmdline(char *str) { if (!str) @@ -1200,7 +1348,7 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) { retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; } @@ -1237,6 +1385,11 @@ static void __init retbleed_select_mitigation(void) if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) return; + if (!should_mitigate_vuln(X86_BUG_RETBLEED)) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + return; + } + /* Intel mitigation selected in retbleed_update_mitigation() */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { @@ -1247,35 +1400,36 @@ static void __init retbleed_select_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; else retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + /* Final mitigation depends on spectre-v2 selection */ + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + else if (boot_cpu_has(X86_FEATURE_IBRS)) + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + else + 
retbleed_mitigation = RETBLEED_MITIGATION_NONE; } } static void __init retbleed_update_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) return; - if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) - goto out; + /* ITS can also enable stuffing */ + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; - /* - * retbleed=stuff is only allowed on Intel. If stuffing can't be used - * then a different mitigation will be selected below. - * - * its=stuff will also attempt to enable stuffing. - */ - if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF || - its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) { - if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - retbleed_mitigation = RETBLEED_MITIGATION_AUTO; - } else { - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_info("Retbleed mitigation updated to stuffing\n"); + /* If SRSO is using IBPB, that works for retbleed too */ + if (srso_mitigation == SRSO_MITIGATION_IBPB) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; - } + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF && + !cdt_possible(spectre_v2_enabled)) { + pr_err("WARNING: retbleed=stuff depends on retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } + /* * Let IBRS trump all on Intel without affecting the effects of the * retbleed= cmdline option except for call depth based stuffing @@ -1294,15 +1448,11 @@ static void __init retbleed_update_mitigation(void) if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) pr_err(RETBLEED_INTEL_MSG); } - /* If nothing has set the mitigation yet, default to NONE. 
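Read together with cdt_possible() earlier in this file, call-depth stuffing is only reachable when retpolines are in use. One illustrative combination follows; the config fragment and command line are an example of a valid setup, not the only one (eIBRS plus retpoline is also accepted by cdt_possible()).

# build-time prerequisites checked by cdt_possible()
CONFIG_MITIGATION_RETPOLINE=y
CONFIG_MITIGATION_CALL_DEPTH_TRACKING=y

# boot-time selection
spectre_v2=retpoline retbleed=stuff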
*/ - if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) - retbleed_mitigation = RETBLEED_MITIGATION_NONE; } -out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } - static void __init retbleed_apply_mitigation(void) { bool mitigate_smt = false; @@ -1358,7 +1508,7 @@ static void __init retbleed_apply_mitigation(void) } if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && - (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) cpu_smt_disable(false); } @@ -1403,13 +1553,17 @@ early_param("indirect_target_selection", its_parse_cmdline); static void __init its_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_ITS)) { its_mitigation = ITS_MITIGATION_OFF; return; } - if (its_mitigation == ITS_MITIGATION_AUTO) - its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + if (its_mitigation == ITS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_ITS)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + else + its_mitigation = ITS_MITIGATION_OFF; + } if (its_mitigation == ITS_MITIGATION_OFF) return; @@ -1440,15 +1594,17 @@ static void __init its_select_mitigation(void) static void __init its_update_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_ITS)) return; switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: - pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + if (its_mitigation != ITS_MITIGATION_OFF) + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); its_mitigation = ITS_MITIGATION_OFF; break; case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: /* Retpoline+CDT mitigates ITS */ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; @@ -1462,13 +1618,8 @@ static void __init its_update_mitigation(void) break; } - /* - * retbleed_update_mitigation() will try to do stuffing if its=stuff. - * If it can't, such as if spectre_v2!=retpoline, then fall back to - * aligned thunks. - */ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && - retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + !cdt_possible(spectre_v2_enabled)) its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; pr_info("%s\n", its_strings[its_mitigation]); @@ -1476,15 +1627,127 @@ static void __init its_update_mitigation(void) static void __init its_apply_mitigation(void) { - /* its=stuff forces retbleed stuffing and is enabled there. 
*/ - if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS) + switch (its_mitigation) { + case ITS_MITIGATION_OFF: + case ITS_MITIGATION_AUTO: + case ITS_MITIGATION_VMEXIT_ONLY: + break; + case ITS_MITIGATION_ALIGNED_THUNKS: + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + case ITS_MITIGATION_RETPOLINE_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + set_return_thunk(call_depth_return_thunk); + break; + } +} + +#undef pr_fmt +#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt + +enum tsa_mitigations { + TSA_MITIGATION_NONE, + TSA_MITIGATION_AUTO, + TSA_MITIGATION_UCODE_NEEDED, + TSA_MITIGATION_USER_KERNEL, + TSA_MITIGATION_VM, + TSA_MITIGATION_FULL, +}; + +static const char * const tsa_strings[] = { + [TSA_MITIGATION_NONE] = "Vulnerable", + [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary", + [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM", + [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", +}; + +static enum tsa_mitigations tsa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE; + +static int __init tsa_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + tsa_mitigation = TSA_MITIGATION_NONE; + else if (!strcmp(str, "on")) + tsa_mitigation = TSA_MITIGATION_FULL; + else if (!strcmp(str, "user")) + tsa_mitigation = TSA_MITIGATION_USER_KERNEL; + else if (!strcmp(str, "vm")) + tsa_mitigation = TSA_MITIGATION_VM; + else + pr_err("Ignoring unknown tsa=%s option.\n", str); + + return 0; +} +early_param("tsa", tsa_parse_cmdline); + +static void __init tsa_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_TSA)) { + tsa_mitigation = TSA_MITIGATION_NONE; return; + } + + if (tsa_mitigation == TSA_MITIGATION_AUTO) { + bool vm = false, uk = false; + + tsa_mitigation = TSA_MITIGATION_NONE; - if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) - setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) { + tsa_mitigation = TSA_MITIGATION_USER_KERNEL; + uk = true; + } + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) { + tsa_mitigation = TSA_MITIGATION_VM; + vm = true; + } - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - set_return_thunk(its_return_thunk); + if (uk && vm) + tsa_mitigation = TSA_MITIGATION_FULL; + } + + if (tsa_mitigation == TSA_MITIGATION_NONE) + return; + + if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR)) + tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED; + + /* + * No need to set verw_clear_cpu_buf_mitigation_selected - it + * doesn't fit all cases here and it is not needed because this + * is the only VERW-based mitigation on AMD. 
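For reference, tsa_parse_cmdline() above gives four switches; the boundary descriptions below are interpretive readings of the selection and apply logic in this file rather than documentation text.

tsa=off    no TSA mitigation
tsa=user   clear CPU buffers on the user/kernel boundary (TSA_MITIGATION_USER_KERNEL)
tsa=vm     clear CPU buffers before entering a guest (TSA_MITIGATION_VM)
tsa=on     both boundaries (TSA_MITIGATION_FULL)

Leaving the parameter unset takes the AUTO path, which derives the same outcomes from the enabled attack vectors.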
+ */ + pr_info("%s\n", tsa_strings[tsa_mitigation]); +} + +static void __init tsa_apply_mitigation(void) +{ + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + break; + case TSA_MITIGATION_VM: + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + break; + case TSA_MITIGATION_FULL: + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + break; + default: + break; + } } #undef pr_fmt @@ -1609,7 +1872,7 @@ static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) char arg[20]; int ret, i; - if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) + if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) return SPECTRE_V2_USER_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2_user", @@ -1647,6 +1910,13 @@ static void __init spectre_v2_user_select_mitigation(void) spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: + if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER)) + break; + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + if (smt_mitigations == SMT_MITIGATIONS_OFF) + break; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL: spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; @@ -1798,8 +2068,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) int ret, i; cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || - cpu_mitigations_off()) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -2002,11 +2271,20 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { - if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off()) + if (!boot_cpu_has(X86_BUG_BHI)) bhi_mitigation = BHI_MITIGATION_OFF; - if (bhi_mitigation == BHI_MITIGATION_AUTO) - bhi_mitigation = BHI_MITIGATION_ON; + if (bhi_mitigation != BHI_MITIGATION_AUTO) + return; + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) { + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) + bhi_mitigation = BHI_MITIGATION_ON; + else + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY; + } else { + bhi_mitigation = BHI_MITIGATION_OFF; + } } static void __init bhi_update_mitigation(void) @@ -2062,8 +2340,11 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_NONE: return; - case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: + if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2)) + break; + fallthrough; + case SPECTRE_V2_CMD_FORCE: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { spectre_v2_enabled = SPECTRE_V2_EIBRS; break; @@ -2117,7 +2398,7 @@ static void __init spectre_v2_update_mitigation(void) } } - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off()) + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); } @@ -2249,10 +2530,10 @@ static void update_mds_branch_idle(void) return; if (sched_smt_active()) { - static_branch_enable(&mds_idle_clear); + static_branch_enable(&cpu_buf_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { - static_branch_disable(&mds_idle_clear); + static_branch_disable(&cpu_buf_idle_clear); } } @@ -2316,6 +2597,25 @@ 
void cpu_bugs_smt_update(void) break; } + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + case TSA_MITIGATION_VM: + case TSA_MITIGATION_AUTO: + case TSA_MITIGATION_FULL: + /* + * TSA-SQ can potentially lead to info leakage between + * SMT threads. + */ + if (sched_smt_active()) + static_branch_enable(&cpu_buf_idle_clear); + else + static_branch_disable(&cpu_buf_idle_clear); + break; + case TSA_MITIGATION_NONE: + case TSA_MITIGATION_UCODE_NEEDED: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -2750,17 +3050,23 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { l1tf_mitigation = L1TF_MITIGATION_OFF; return; } - if (l1tf_mitigation == L1TF_MITIGATION_AUTO) { - if (cpu_mitigations_auto_nosmt()) - l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; - else - l1tf_mitigation = L1TF_MITIGATION_FLUSH; + if (l1tf_mitigation != L1TF_MITIGATION_AUTO) + return; + + if (!should_mitigate_vuln(X86_BUG_L1TF)) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; } + + if (smt_mitigations == SMT_MITIGATIONS_ON) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; } static void __init l1tf_apply_mitigation(void) @@ -2834,31 +3140,18 @@ early_param("l1tf", l1tf_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt -enum srso_mitigation { - SRSO_MITIGATION_NONE, - SRSO_MITIGATION_AUTO, - SRSO_MITIGATION_UCODE_NEEDED, - SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, - SRSO_MITIGATION_MICROCODE, - SRSO_MITIGATION_SAFE_RET, - SRSO_MITIGATION_IBPB, - SRSO_MITIGATION_IBPB_ON_VMEXIT, - SRSO_MITIGATION_BP_SPEC_REDUCE, -}; - static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", + [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only", [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; - static int __init srso_parse_cmdline(char *str) { if (!str) @@ -2885,35 +3178,44 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode; - - if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + if (!boot_cpu_has_bug(X86_BUG_SRSO)) { srso_mitigation = SRSO_MITIGATION_NONE; - - if (srso_mitigation == SRSO_MITIGATION_NONE) return; + } - if (srso_mitigation == SRSO_MITIGATION_AUTO) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - - has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (has_microcode) { - /* - * Zen1/2 with SMT off aren't vulnerable after the right - * IBPB microcode has been applied. - */ - if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { - setup_force_cpu_cap(X86_FEATURE_SRSO_NO); + if (srso_mitigation == SRSO_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_SRSO)) { + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else { srso_mitigation = SRSO_MITIGATION_NONE; return; } - } else { + } + + /* Zen1/2 with SMT off aren't vulnerable to SRSO. 
*/ + if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { + srso_mitigation = SRSO_MITIGATION_NOSMT; + return; + } + + if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); + + /* + * Safe-RET provides partial mitigation without microcode, but + * other mitigations require microcode to provide any + * mitigations. + */ + if (srso_mitigation == SRSO_MITIGATION_SAFE_RET) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; + else + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } switch (srso_mitigation) { case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; goto ibpb_on_vmexit; @@ -2923,9 +3225,6 @@ static void __init srso_select_mitigation(void) pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); srso_mitigation = SRSO_MITIGATION_NONE; } - - if (!has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; break; ibpb_on_vmexit: case SRSO_MITIGATION_IBPB_ON_VMEXIT: @@ -2940,9 +3239,6 @@ ibpb_on_vmexit: pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); srso_mitigation = SRSO_MITIGATION_NONE; } - - if (!has_microcode) - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; break; default: break; @@ -2957,8 +3253,7 @@ static void __init srso_update_mitigation(void) srso_mitigation = SRSO_MITIGATION_IBPB; if (boot_cpu_has_bug(X86_BUG_SRSO) && - !cpu_mitigations_off() && - !boot_cpu_has(X86_FEATURE_SRSO_NO)) + !cpu_mitigations_off()) pr_info("%s\n", srso_strings[srso_mitigation]); } @@ -3254,9 +3549,6 @@ static ssize_t retbleed_show_state(char *buf) static ssize_t srso_show_state(char *buf) { - if (boot_cpu_has(X86_FEATURE_SRSO_NO)) - return sysfs_emit(buf, "Mitigation: SMT disabled\n"); - return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } @@ -3265,6 +3557,11 @@ static ssize_t gds_show_state(char *buf) return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); } +static ssize_t tsa_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -3328,6 +3625,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITS: return its_show_state(buf); + case X86_BUG_TSA: + return tsa_show_state(buf); + default: break; } @@ -3414,6 +3714,11 @@ ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_att { return cpu_show_common(dev, attr, buf, X86_BUG_ITS); } + +ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TSA); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8feb8fd2957a..34a054181c4d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -26,6 +26,7 @@ #include <linux/pgtable.h> #include <linux/stackprotector.h> #include <linux/utsname.h> +#include <linux/efi.h> #include <asm/alternative.h> #include <asm/cmdline.h> @@ -1233,6 +1234,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define ITS BIT(8) /* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ #define ITS_NATIVE_ONLY BIT(9) +/* CPU is affected by Transient Scheduler Attacks */ +#define TSA BIT(10) static const struct x86_cpu_id 
cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1280,7 +1283,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x16, RETBLEED), VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), - VULNBL_AMD(0x19, SRSO), + VULNBL_AMD(0x19, SRSO | TSA), VULNBL_AMD(0x1a, SRSO), {} }; @@ -1530,6 +1533,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); } + if (c->x86_vendor == X86_VENDOR_AMD) { + if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) || + !cpu_has(c, X86_FEATURE_TSA_L1_NO)) { + if (cpu_matches(cpu_vuln_blacklist, TSA) || + /* Enable bug on Zen guests to allow for live migration. */ + (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN))) + setup_force_cpu_bug(X86_BUG_TSA); + } + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -2243,20 +2256,16 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -/* - * Clear all 6 debug registers: - */ -static void clear_all_debug_regs(void) +static void initialize_debug_regs(void) { - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - - set_debugreg(0, i); - } + /* Control register first -- to make sure everything is disabled. */ + set_debugreg(DR7_FIXED_1, 7); + set_debugreg(DR6_RESERVED, 6); + /* dr5 and dr4 don't exist */ + set_debugreg(0, 3); + set_debugreg(0, 2); + set_debugreg(0, 1); + set_debugreg(0, 0); } #ifdef CONFIG_KGDB @@ -2417,7 +2426,7 @@ void cpu_init(void) load_mm_ldt(&init_mm); - clear_all_debug_regs(); + initialize_debug_regs(); dbg_restore_debug_regs(); doublefault_init_cpu_tss(); @@ -2530,6 +2539,12 @@ void __init arch_cpu_finalize_init(void) fpu__init_cpu(); /* + * This needs to follow the FPU initializtion, since EFI depends on it. + */ + if (efi_enabled(EFI_RUNTIME_SERVICES)) + efi_enter_virtual_mode(); + + /* * Ensure that access to the per CPU representation has the initial * boot CPU configuration. */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9d852c3b2cb5..5c4eb28c3ac9 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -350,7 +350,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) struct thresh_restart { struct threshold_block *b; - int reset; int set_lvt_off; int lvt_off; u16 old_limit; @@ -432,13 +431,13 @@ static void threshold_restart_bank(void *_tr) rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) - tr->reset = 1; /* limit cannot be lower than err count */ - - if (tr->reset) { /* reset err count and overflow bit */ - hi = - (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - tr->b->threshold_limit); + /* + * Reset error count and overflow bit. + * This is done during init or after handling an interrupt. 
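A stand-alone sketch of the count-seeding arithmetic used by the rewritten threshold_restart_bank(): the bank interrupts when its error count reaches THRESHOLD_MAX, so the count field is preloaded with THRESHOLD_MAX minus the requested limit while the stale overflow bit is cleared. The mask and bit-position values below are assumptions for illustration, not the real MSR layout.

/* Build: cc -o mce_thresh_sketch mce_thresh_sketch.c */
#include <stdint.h>
#include <stdio.h>

#define THRESHOLD_MAX      0xFFFu        /* assumed width of the error-count field */
#define MASK_ERR_COUNT_HI  0x00000FFFu   /* assumed: count field in the high MSR word */
#define MASK_OVERFLOW_HI   0x00010000u   /* assumed: overflow (interrupt pending) bit */

/*
 * The bank counts errors upward and raises the threshold interrupt when
 * the count reaches THRESHOLD_MAX.  To be interrupted after 'limit'
 * further errors, the count field is preloaded with THRESHOLD_MAX - limit,
 * and the stale overflow bit is cleared in the same write.
 */
static uint32_t seed_threshold(uint32_t hi, uint32_t limit)
{
	hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
	hi |= THRESHOLD_MAX - limit;
	return hi;
}

int main(void)
{
	uint32_t hi = 0x00010ABCu;	/* overflow set, some stale count */

	printf("seeded hi for limit=10: 0x%08x\n", seed_threshold(hi, 10));
	return 0;
}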
+ */ + if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) { + hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI); + hi |= THRESHOLD_MAX - tr->b->threshold_limit; } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); @@ -1113,13 +1112,20 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol } bank_type = smca_get_bank_type(cpu, bank); - if (bank_type >= N_SMCA_BANK_TYPES) - return NULL; if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; - return NULL; + } + + if (b && b->block) { + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block); + return buf_mcatype; + } + + if (bank_type >= N_SMCA_BANK_TYPES) { + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank); + return buf_mcatype; } if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e9b3c5d4a52e..4da4eab56c81 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1740,6 +1740,11 @@ static void mc_poll_banks_default(void) void (*mc_poll_banks)(void) = mc_poll_banks_default; +static bool should_enable_timer(unsigned long iv) +{ + return !mca_cfg.ignore_ce && iv; +} + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1763,7 +1768,7 @@ static void mce_timer_fn(struct timer_list *t) if (mce_get_storm_mode()) { __start_timer(t, HZ); - } else { + } else if (should_enable_timer(iv)) { __this_cpu_write(mce_next_interval, iv); __start_timer(t, iv); } @@ -2156,11 +2161,10 @@ static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; - if (mca_cfg.ignore_ce || !iv) - return; - - this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (should_enable_timer(iv)) { + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } static void __mcheck_cpu_setup_timer(void) @@ -2801,15 +2805,9 @@ static int mce_cpu_dead(unsigned int cpu) static int mce_cpu_online(unsigned int cpu) { struct timer_list *t = this_cpu_ptr(&mce_timer); - int ret; mce_device_create(cpu); - - ret = mce_threshold_create_device(cpu); - if (ret) { - mce_device_remove(cpu); - return ret; - } + mce_threshold_create_device(cpu); mce_reenable_cpu(); mce_start_timer(t); return 0; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index efcf21e9552e..9b149b9c4109 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -478,6 +478,7 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); + cmci_clear(); } bool intel_filter_mce(struct mce *m) diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c index 2a1655b1fdd8..1fd349cfc802 100644 --- a/arch/x86/kernel/cpu/microcode/amd_shas.c +++ b/arch/x86/kernel/cpu/microcode/amd_shas.c @@ -231,6 +231,13 @@ static const struct patch_digest phashes[] = { 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21, } }, + { 0xa0011d7, { + 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b, + 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12, + 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2, + 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36, + } + }, { 0xa001223, { 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8, 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4, @@ -294,6 +301,13 @@ static const struct patch_digest 
phashes[] = { 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59, } }, + { 0xa00123b, { + 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2, + 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3, + 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28, + 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72, + } + }, { 0xa00820c, { 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3, 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63, @@ -301,6 +315,13 @@ static const struct patch_digest phashes[] = { 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2, } }, + { 0xa00820d, { + 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4, + 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca, + 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1, + 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7, + } + }, { 0xa10113e, { 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10, 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0, @@ -322,6 +343,13 @@ static const struct patch_digest phashes[] = { 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4, } }, + { 0xa10114c, { + 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64, + 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74, + 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a, + 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0, + } + }, { 0xa10123e, { 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18, 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d, @@ -343,6 +371,13 @@ static const struct patch_digest phashes[] = { 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75, } }, + { 0xa10124c, { + 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90, + 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46, + 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc, + 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2, + } + }, { 0xa108108, { 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9, 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6, @@ -350,6 +385,13 @@ static const struct patch_digest phashes[] = { 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16, } }, + { 0xa108109, { + 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa, + 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b, + 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35, + 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59, + } + }, { 0xa20102d, { 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11, 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89, @@ -357,6 +399,13 @@ static const struct patch_digest phashes[] = { 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4, } }, + { 0xa20102e, { + 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd, + 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6, + 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5, + 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe, + } + }, { 0xa201210, { 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe, 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9, @@ -364,6 +413,13 @@ static const struct patch_digest phashes[] = { 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41, } }, + { 0xa201211, { + 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95, + 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27, + 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff, + 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27, + } + }, { 0xa404107, { 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45, 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0, @@ -371,6 +427,13 @@ static const struct patch_digest phashes[] = { 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99, } }, + { 0xa404108, { + 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc, + 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb, + 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19, + 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00, + } + }, { 0xa500011, { 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4, 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1, @@ -378,6 +441,13 @@ static const struct patch_digest phashes[] = { 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74, } }, + { 0xa500012, { + 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4, + 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16, + 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f, + 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63, + } + }, { 0xa601209, { 
0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32, 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30, @@ -385,6 +455,13 @@ static const struct patch_digest phashes[] = { 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d, } }, + { 0xa60120a, { + 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d, + 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b, + 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d, + 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c, + } + }, { 0xa704107, { 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6, 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93, @@ -392,6 +469,13 @@ static const struct patch_digest phashes[] = { 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39, } }, + { 0xa704108, { + 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93, + 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98, + 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49, + 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a, + } + }, { 0xa705206, { 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4, 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7, @@ -399,6 +483,13 @@ static const struct patch_digest phashes[] = { 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc, } }, + { 0xa705208, { + 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19, + 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2, + 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11, + 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff, + } + }, { 0xa708007, { 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3, 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2, @@ -406,6 +497,13 @@ static const struct patch_digest phashes[] = { 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93, } }, + { 0xa708008, { + 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46, + 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab, + 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16, + 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b, + } + }, { 0xa70c005, { 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b, 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f, @@ -413,6 +511,13 @@ static const struct patch_digest phashes[] = { 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13, } }, + { 0xa70c008, { + 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21, + 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e, + 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66, + 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0, + } + }, { 0xaa00116, { 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63, 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5, @@ -441,4 +546,11 @@ static const struct patch_digest phashes[] = { 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef, } }, + { 0xaa00216, { + 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5, + 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08, + 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2, + 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1, + } + }, }; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index fe50eb5b7c4a..b92e09a87c69 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -17,8 +17,8 @@ #define pr_fmt(fmt) "microcode: " fmt -#include <linux/platform_device.h> #include <linux/stop_machine.h> +#include <linux/device/faux.h> #include <linux/syscore_ops.h> #include <linux/miscdevice.h> #include <linux/capability.h> @@ -249,7 +249,7 @@ static void reload_early_microcode(unsigned int cpu) } /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +static struct faux_device *microcode_fdev; #ifdef CONFIG_MICROCODE_LATE_LOADING /* @@ -690,7 +690,7 @@ static int load_late_locked(void) if (!setup_cpus()) return -EBUSY; - switch (microcode_ops->request_microcode_fw(0, µcode_pdev->dev)) { + switch (microcode_ops->request_microcode_fw(0, µcode_fdev->dev)) { case UCODE_NEW: return load_late_stop_cpus(false); case UCODE_NEW_SAFE: @@ -841,9 +841,9 @@ static int __init microcode_init(void) if (early_data.new_rev) pr_info_once("Updated 
early from: 0x%08x\n", early_data.old_rev); - microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); - if (IS_ERR(microcode_pdev)) - return PTR_ERR(microcode_pdev); + microcode_fdev = faux_device_create("microcode", NULL, NULL); + if (!microcode_fdev) + return -ENODEV; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { @@ -862,7 +862,7 @@ static int __init microcode_init(void) return 0; out_pdev: - platform_device_unregister(microcode_pdev); + faux_device_destroy(microcode_fdev); return error; } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e2c6b471d230..8c18327eb10b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -593,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (boot_cpu_has(X86_FEATURE_MTRR)) + if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 0c13b0befd8a..d8a04b195da2 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -2,4 +2,6 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d987b11c168c..187d527ef73b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -61,7 +61,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L3] = { .r_resctrl = { - .rid = RDT_RESOURCE_L3, .name = "L3", .ctrl_scope = RESCTRL_L3_CACHE, .mon_scope = RESCTRL_L3_CACHE, @@ -75,7 +74,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L2] = { .r_resctrl = { - .rid = RDT_RESOURCE_L2, .name = "L2", .ctrl_scope = RESCTRL_L2_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), @@ -87,7 +85,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_MBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_MBA, .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), @@ -97,7 +94,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_SMBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_SMBA, .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), @@ -165,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } -bool is_mba_sc(struct rdt_resource *r) -{ - if (!r) - r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - /* - * The software controller support is only applicable to MBA resource. - * Make sure to check for resource type. - */ - if (r->rid != RDT_RESOURCE_MBA) - return false; - - return r->membw.mba_sc; -} - /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. 
@@ -517,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; + struct cacheinfo *ci; int err; lockdep_assert_held(&domain_list_lock); @@ -544,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!d->ci) { + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); mon_domain_free(hw_dom); return; } + d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); arch_mon_domain_online(r, d); @@ -738,7 +721,7 @@ struct rdt_options { bool force_off, force_on; }; -static struct rdt_options rdt_options[] __initdata = { +static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), @@ -778,7 +761,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -bool __init rdt_cpu_has(int flag) +bool rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -798,7 +781,7 @@ bool __init rdt_cpu_has(int flag) return ret; } -__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) { if (!rdt_cpu_has(X86_FEATURE_BMEC)) return false; @@ -1012,7 +995,11 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; - int state, ret; + int state, ret, i; + + /* for_each_rdt_resource() requires all rid to be initialised. */ + for (i = 0; i < RDT_NUM_RESOURCES; i++) + rdt_resources_all[i].r_resctrl.rid = i; /* * Initialize functions(or definitions) that are different diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 0a0ac5f6112e..1189c0df4ad7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,277 +16,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> -#include <linux/kernfs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/tick.h> #include "internal.h" -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - -typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); - -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - int ret; - u32 bw; - - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtou32(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Invalid MB value %s\n", buf); - return false; - } - - /* Nothing else to do if software controller is enabled. 
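A minimal user-space sketch of the bandwidth check performed by the bw_validate() helper removed from this arch-specific file: range-check the percentage against the hardware limits, then round it up to the next control step. The limits in main() are made up, and the mba_sc path that accepts raw MBps values is omitted.

/* Build: cc -o bw_sketch bw_sketch.c */
#include <stdbool.h>
#include <stdio.h>

/* Illustrative limits; real values come from CPUID via struct rdt_resource. */
struct membw { unsigned int min_bw, max_bw, bw_gran; };

/*
 * Reject values outside the hardware range, then round up to the next
 * control step the hardware can actually program.
 */
static bool bw_validate(unsigned int bw, const struct membw *mb, unsigned int *out)
{
	if (bw < mb->min_bw || bw > mb->max_bw) {
		fprintf(stderr, "MB value %u out of range [%u,%u]\n",
			bw, mb->min_bw, mb->max_bw);
		return false;
	}
	/* roundup() to the control-step granularity */
	*out = ((bw + mb->bw_gran - 1) / mb->bw_gran) * mb->bw_gran;
	return true;
}

int main(void)
{
	struct membw mb = { .min_bw = 10, .max_bw = 100, .bw_gran = 10 };
	unsigned int val;

	if (bw_validate(42, &mb, &val))
		printf("42%% is programmed as %u%%\n", val);	/* -> 50% */
	return 0;
}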
*/ - if (is_mba_sc(r)) { - *data = bw; - return true; - } - - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; - struct rdt_resource *r = s->res; - u32 bw_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - if (!bw_validate(data->buf, &bw_val, r)) - return -EINVAL; - - if (is_mba_sc(r)) { - d->mbps_val[closid] = bw_val; - return 0; - } - - cfg->new_ctrl = bw_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 - * - * Haswell does not support a non-contiguous 1s value and additionally - * requires at least two bits set. - * AMD allows non-contiguous bitmasks. - */ -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit, val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Are non-contiguous bitmasks allowed? */ - if (!r->cache.arch_has_sparse_bitmasks && - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); - return false; - } - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("Need at least %d bits in the mask\n", - r->cache.min_cbm_bits); - return false; - } - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct rdtgroup *rdtgrp = data->rdtgrp; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 cbm_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - /* - * Cannot set up more than one pseudo-locked region in a cache - * hierarchy. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - rdtgroup_pseudo_locked_in_hierarchy(d)) { - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); - return -EINVAL; - } - - if (!cbm_validate(data->buf, &cbm_val, r)) - return -EINVAL; - - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); - return -EINVAL; - } - - /* - * The CBM may not overlap with the CBM of another closid if - * either is exclusive. 
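A compact sketch of the checks cbm_validate() makes in the removed hunk: the mask must fit in cbm_len bits, satisfy the minimum bit count, and, on hardware without sparse-bitmask support, keep its set bits contiguous. Bit scanning is done with plain loops instead of find_first_bit()/find_next_zero_bit().

/* Build: cc -o cbm_sketch cbm_sketch.c */
#include <stdbool.h>
#include <stdio.h>

static bool cbm_validate(unsigned long val, unsigned int cbm_len,
			 unsigned int min_bits, bool sparse_ok)
{
	unsigned long supported = (1UL << cbm_len) - 1;
	unsigned int first = 0, nbits = 0;

	if ((min_bits > 0 && val == 0) || val > supported)
		return false;		/* mask out of range */

	while (first < cbm_len && !(val & (1UL << first)))
		first++;		/* index of first set bit */
	while (first + nbits < cbm_len && (val & (1UL << (first + nbits))))
		nbits++;		/* length of the first run of 1s */

	/* any further set bit past the run means the mask is non-contiguous */
	if (!sparse_ok && (val >> (first + nbits)) != 0)
		return false;

	return nbits >= min_bits;
}

int main(void)
{
	printf("0x0f0: %d\n", cbm_validate(0x0f0, 12, 2, false));	/* 1 */
	printf("0x0f1: %d\n", cbm_validate(0x0f1, 12, 2, false));	/* 0: hole in mask */
	printf("0x0f1 sparse: %d\n", cbm_validate(0x0f1, 12, 0, true));	/* 1: AMD-style */
	return 0;
}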
- */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { - rdt_last_cmd_puts("Overlaps with exclusive group\n"); - return -EINVAL; - } - - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - rdt_last_cmd_puts("Overlaps with other group\n"); - return -EINVAL; - } - } - - cfg->new_ctrl = cbm_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct resctrl_schema *s, - struct rdtgroup *rdtgrp) -{ - enum resctrl_conf_type t = s->conf_type; - ctrlval_parser_t *parse_ctrlval = NULL; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - struct rdt_parse_data data; - struct rdt_ctrl_domain *d; - char *dom = NULL, *id; - unsigned long dom_id; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - parse_ctrlval = &parse_cbm; - break; - case RESCTRL_SCHEMA_RANGE: - parse_ctrlval = &parse_bw; - break; - } - - if (WARN_ON_ONCE(!parse_ctrlval)) - return -EINVAL; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); - return -EINVAL; - } - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); - return -EINVAL; - } - dom = strim(dom); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (d->hdr.id == dom_id) { - data.buf = dom; - data.rdtgrp = rdtgrp; - if (parse_ctrlval(&data, s, d)) - return -EINVAL; - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; - /* - * In pseudo-locking setup mode and just - * parsed a valid CBM that should be - * pseudo-locked. Only one locked region per - * resource group and domain so just do - * the required initialization for single - * region and return. 
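The parse_line() removal above documents the "id=mask;id=mask" schemata format; a stripped-down user-space parser of that format, with domain lookup and per-resource validation reduced to a printf, looks roughly like this:

/* Build: cc -o schemata_sketch schemata_sketch.c */
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Walk one schemata line with strsep(), splitting "id=mask" pairs on ';'
 * and the decimal domain id from the mask value on '='.
 */
static int parse_line(char *line)
{
	char *dom, *id;

	while (line && line[0] != '\0') {
		dom = strsep(&line, ";");
		id = strsep(&dom, "=");
		if (!dom || !*id) {
			fprintf(stderr, "Missing '=' or empty domain id\n");
			return -1;
		}
		printf("domain %lu <- mask %s\n", strtoul(id, NULL, 10), dom);
	}
	return 0;
}

int main(void)
{
	char buf[] = "0=7ff0;1=00ff";

	return parse_line(buf) ? 1 : 0;
}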
- */ - rdtgrp->plr->s = s; - rdtgrp->plr->d = d; - rdtgrp->plr->cbm = cfg->new_ctrl; - d->plr = rdtgrp->plr; - return 0; - } - goto next; - } - } - return -EINVAL; -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { @@ -351,100 +83,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, - struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - - list_for_each_entry(s, &resctrl_schema_all, list) { - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, s, rdtgrp); - } - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct resctrl_schema *s; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - char *tok, *resname; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - /* - * No changes to pseudo-locked region allowed. It has to be removed - * and re-created instead. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = -EINVAL; - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); - goto out; - } - - rdt_staged_configs_clear(); - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - rdt_last_cmd_puts("Missing ':'\n"); - ret = -EINVAL; - goto out; - } - if (tok[0] == '\0') { - rdt_last_cmd_printf("Missing '%s' value\n", resname); - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); - if (ret) - goto out; - } - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - - /* - * Writes to mba_sc resources update the software controller, - * not the control MSR. - */ - if (is_mba_sc(r)) - continue; - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret) - goto out; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * If pseudo-locking fails we keep the resource group in - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service - * active and updated for just the domain the pseudo-locked - * region was requested for. 
- */ - ret = rdtgroup_pseudo_lock_create(rdtgrp); - } - -out: - rdt_staged_configs_clear(); - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -453,276 +91,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } - -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) -{ - struct rdt_resource *r = schema->res; - struct rdt_ctrl_domain *dom; - bool sep = false; - u32 ctrl_val; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - if (is_mba_sc(r)) - ctrl_val = dom->mbps_val[closid]; - else - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); - - seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - struct rdtgroup *rdtgrp; - int ret = 0; - u32 closid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - list_for_each_entry(schema, &resctrl_schema_all, list) { - seq_printf(s, "%s:uninitialized\n", schema->name); - } - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->s->res->name, - rdtgrp->plr->d->hdr.id, - rdtgrp->plr->cbm); - } - } else { - closid = rdtgrp->closid; - list_for_each_entry(schema, &resctrl_schema_all, list) { - if (closid < schema->num_closid) - show_doms(s, schema, closid); - } - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); - return ret; -} - -static int smp_mon_event_count(void *arg) -{ - mon_event_count(arg); - - return 0; -} - -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else - ret = -EINVAL; - } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (ret) - rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); - - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - switch (rdtgrp->mba_mbps_event) { - case QOS_L3_MBM_LOCAL_EVENT_ID: - seq_puts(s, "mbm_local_bytes\n"); - break; - case QOS_L3_MBM_TOTAL_EVENT_ID: - seq_puts(s, "mbm_total_bytes\n"); - break; - default: - pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); - ret = -EINVAL; - break; - } - } else 
{ - ret = -ENOENT; - } - - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, - struct list_head **pos) -{ - struct rdt_domain_hdr *d; - struct list_head *l; - - list_for_each(l, h) { - d = list_entry(l, struct rdt_domain_hdr, list); - /* When id is found, return its domain. */ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; -} - -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) -{ - int cpu; - - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ - rr->rgrp = rdtgrp; - rr->evtid = evtid; - rr->r = r; - rr->d = d; - rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; - } - - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); - - /* - * cpumask_any_housekeeping() prefers housekeeping CPUs, but - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. - * MPAM's resctrl_arch_rmid_read() is unable to read the - * counters on some platforms if its called in IRQ context. - */ - if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(cpumask, mon_event_count, rr, 1); - else - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); -} - -int rdtgroup_mondata_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rdt_domain_hdr *hdr; - struct rmid_read rr = {0}; - struct rdt_mon_domain *d; - u32 resid, evtid, domid; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - union mon_data_bits md; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto out; - } - - md.priv = of->kn->priv; - resid = md.u.rid; - domid = md.u.domid; - evtid = md.u.evtid; - r = resctrl_arch_get_resource(resid); - - if (md.u.sum) { - /* - * This file requires summing across all domains that share - * the L3 cache id that was provided in the "domid" field of the - * mon_data_bits union. Search all domains in the resource for - * one that matches this cache id. - */ - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; - mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); - goto checkresult; - } - } - ret = -ENOENT; - goto out; - } else { - /* - * This file provides data from a single domain. Search - * the resource to find the domain with "domid". 
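A toy model of the two read paths in rdtgroup_mondata_show(): either a single domain is read by id, or (e.g. on Sub-NUMA Cluster systems) every domain sharing the given L3 cache id is summed, and the read counts as successful if any of those domains could be read. Struct and function names here are simplified stand-ins.

/* Build: cc -o mondata_sketch mondata_sketch.c */
#include <stdio.h>

/* Toy stand-in for struct rdt_mon_domain: an id plus the L3 cache id it sits under. */
struct mon_domain { int id; int l3_id; unsigned long long count; int err; };

static int read_event(struct mon_domain *doms, int n, int domid, int sum,
		      unsigned long long *val)
{
	int i, err = -1;

	*val = 0;
	for (i = 0; i < n; i++) {
		if (sum ? doms[i].l3_id != domid : doms[i].id != domid)
			continue;
		if (doms[i].err)
			continue;	/* summing: one good read is enough */
		*val += doms[i].count;
		err = 0;
		if (!sum)
			break;
	}
	return err;
}

int main(void)
{
	struct mon_domain doms[] = {
		{ .id = 0, .l3_id = 0, .count = 100 },
		{ .id = 1, .l3_id = 0, .count = 250 },
		{ .id = 2, .l3_id = 1, .count =  40 },
	};
	unsigned long long v;

	if (!read_event(doms, 3, 0, 1, &v))
		printf("%llu\n", v);		/* 350: both domains under L3 id 0 */
	else
		printf("Unavailable\n");
	return 0;
}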
- */ - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { - ret = -ENOENT; - goto out; - } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); - } - -checkresult: - - if (rr.err == -EIO) - seq_puts(m, "Error\n"); - else if (rr.err == -EINVAL) - seq_puts(m, "Unavailable\n"); - else - seq_printf(m, "%llu\n", rr.val); - -out: - rdtgroup_kn_unlock(of->kn); - return ret; -} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index eaae99602b61..5e3c41b36437 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -3,28 +3,21 @@ #define _ASM_X86_RESCTRL_INTERNAL_H #include <linux/resctrl.h> -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/fs_context.h> -#include <linux/jump_label.h> -#include <linux/tick.h> - -#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -#define CQM_LIMBOCHECK_INTERVAL 1000 - #define MBM_CNTR_WIDTH_BASE 24 -#define MBM_OVERFLOW_INTERVAL 1000 -#define MAX_MBA_BW 100u + #define MBA_IS_LINEAR 0x4 + #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) + #define RMID_VAL_UNAVAIL BIT_ULL(62) + /* * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for * data to be returned. The counter width is discovered from the hardware @@ -33,278 +26,6 @@ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) /** - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that - * aren't marked nohz_full - * @mask: The mask to pick a CPU from. - * @exclude_cpu:The CPU to avoid picking. - * - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping - * CPUs that don't use nohz_full, these are preferred. Pass - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. - * - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. - */ -static inline unsigned int -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) -{ - unsigned int cpu, hk_cpu; - - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) - cpu = cpumask_any(mask); - else - cpu = cpumask_any_but(mask, exclude_cpu); - - /* Only continue if tick_nohz_full_mask has been initialized. */ - if (!tick_nohz_full_enabled()) - return cpu; - - /* If the CPU picked isn't marked nohz_full nothing more needs doing. 
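The cpumask_any_housekeeping() helper being removed here prefers CPUs that are not nohz_full so the resulting IPI does not disturb isolated work; an array-based approximation of that selection:

/* Build: cc -o hk_pick_sketch hk_pick_sketch.c */
#include <stdbool.h>
#include <stdio.h>

#define NO_EXCLUDE -1	/* stands in for RESCTRL_PICK_ANY_CPU */

/*
 * Pick any CPU from the candidate list except @exclude, but prefer one
 * that is not nohz_full; only fall back to a nohz_full CPU when nothing
 * else is available.
 */
static int pick_cpu(const int *cpus, int n, const bool *nohz_full, int exclude)
{
	int i, fallback = -1;

	for (i = 0; i < n; i++) {
		if (cpus[i] == exclude)
			continue;
		if (!nohz_full[cpus[i]])
			return cpus[i];		/* housekeeping CPU: best choice */
		if (fallback < 0)
			fallback = cpus[i];	/* remember a nohz_full CPU just in case */
	}
	return fallback;			/* -1 if nothing usable */
}

int main(void)
{
	int domain_cpus[] = { 2, 3, 4 };
	bool nohz_full[8] = { [2] = true, [3] = true };	/* CPUs 2 and 3 are isolated */

	printf("picked CPU %d\n", pick_cpu(domain_cpus, 3, nohz_full, NO_EXCLUDE));
	return 0;
}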
*/ - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) - return cpu; - - /* Try to find a CPU that isn't nohz_full to use in preference */ - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); - if (hk_cpu == exclude_cpu) - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); - - if (hk_cpu < nr_cpu_ids) - cpu = hk_cpu; - - return cpu; -} - -struct rdt_fs_context { - struct kernfs_fs_context kfc; - bool enable_cdpl2; - bool enable_cdpl3; - bool enable_mba_mbps; - bool enable_debug; -}; - -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - - return container_of(kfc, struct rdt_fs_context, kfc); -} - -/** - * struct mon_evt - Entry in the event list of a resource - * @evtid: event id - * @name: name of the event - * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list - */ -struct mon_evt { - enum resctrl_event_id evtid; - char *name; - bool configurable; - struct list_head list; -}; - -/** - * union mon_data_bits - Monitoring details for each event file. - * @priv: Used to store monitoring event data in @u - * as kernfs private data. - * @u.rid: Resource id associated with the event file. - * @u.evtid: Event id associated with the event file. - * @u.sum: Set when event must be summed across multiple - * domains. - * @u.domid: When @u.sum is zero this is the domain to which - * the event file belongs. When @sum is one this - * is the id of the L3 cache that all domains to be - * summed share. - * @u: Name of the bit fields struct. - */ -union mon_data_bits { - void *priv; - struct { - unsigned int rid : 10; - enum resctrl_event_id evtid : 7; - unsigned int sum : 1; - unsigned int domid : 14; - } u; -}; - -/** - * struct rmid_read - Data passed across smp_call*() to read event count. - * @rgrp: Resource group for which the counter is being read. If it is a parent - * resource group then its event count is summed with the count from all - * its child resource groups. - * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. - * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. - * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
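The mon_data_bits union documented above packs a mon_data file's identity into the bits of a kernfs private pointer; a stand-alone demonstration of the same bitfield-in-a-union trick. The 10/7/1/14 widths follow the documented layout, the values in main() are arbitrary.

/* Build: cc -o mondata_bits_sketch mondata_bits_sketch.c */
#include <stdio.h>

/*
 * The rid/evtid/sum/domid tuple shares storage with a void *, so the whole
 * identity fits in the kernfs private pointer without a separate allocation.
 */
union mon_data_bits {
	void *priv;
	struct {
		unsigned int rid   : 10;
		unsigned int evtid : 7;
		unsigned int sum   : 1;
		unsigned int domid : 14;
	} u;
};

int main(void)
{
	union mon_data_bits md = { .priv = 0 };

	md.u.rid = 3;		/* resource id   */
	md.u.evtid = 2;		/* monitor event */
	md.u.sum = 1;		/* sum across domains sharing one L3 */
	md.u.domid = 42;	/* here: the shared L3 cache id */

	/* Round-trip through the pointer, as kernfs would hand it back. */
	void *stored = md.priv;
	union mon_data_bits back = { .priv = stored };

	printf("rid=%u evtid=%u sum=%u domid=%u\n",
	       back.u.rid, back.u.evtid, back.u.sum, back.u.domid);
	return 0;
}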
- */ -struct rmid_read { - struct rdtgroup *rgrp; - struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; - bool first; - struct cacheinfo *ci; - int err; - u64 val; - void *arch_mon_ctx; -}; - -extern struct list_head resctrl_schema_all; -extern bool resctrl_mounted; - -enum rdt_group_type { - RDTCTRL_GROUP = 0, - RDTMON_GROUP, - RDT_NUM_GROUP, -}; - -/** - * enum rdtgrp_mode - Mode of a RDT resource group - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations - * allowed AND the allocations are Cache Pseudo-Locked - * @RDT_NUM_MODES: Total number of modes - * - * The mode of a resource group enables control over the allowed overlap - * between allocations associated with different resource groups (classes - * of service). User is able to modify the mode of a resource group by - * writing to the "mode" resctrl file associated with the resource group. - * - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by - * writing the appropriate text to the "mode" file. A resource group enters - * "pseudo-locked" mode after the schemata is written while the resource - * group is in "pseudo-locksetup" mode. - */ -enum rdtgrp_mode { - RDT_MODE_SHAREABLE = 0, - RDT_MODE_EXCLUSIVE, - RDT_MODE_PSEUDO_LOCKSETUP, - RDT_MODE_PSEUDO_LOCKED, - - /* Must be last */ - RDT_NUM_MODES, -}; - -/** - * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn: kernfs node for the mon_data directory - * @parent: parent rdtgrp - * @crdtgrp_list: child rdtgroup node list - * @rmid: rmid for this rdtgroup - */ -struct mongroup { - struct kernfs_node *mon_data_kn; - struct rdtgroup *parent; - struct list_head crdtgrp_list; - u32 rmid; -}; - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - * @type: indicates type of this rdtgroup - either - * monitor only or ctrl_mon group - * @mon: mongroup related data - * @mode: mode of resource group - * @mba_mbps_event: input monitoring event id when mba_sc is enabled - * @plr: pseudo-locked region - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; - enum resctrl_event_id mba_mbps_event; - struct pseudo_lock_region *plr; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* - * Define the file type flags for base and info directories. 
- */ -#define RFTYPE_INFO BIT(0) -#define RFTYPE_BASE BIT(1) -#define RFTYPE_CTRL BIT(4) -#define RFTYPE_MON BIT(5) -#define RFTYPE_TOP BIT(6) -#define RFTYPE_RES_CACHE BIT(8) -#define RFTYPE_RES_MB BIT(9) -#define RFTYPE_DEBUG BIT(10) -#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) -#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width; - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RFTYPE_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - const struct kernfs_ops *kf_ops; - unsigned long flags; - unsigned long fflags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct mbm_state - status for each MBM counter in each domain - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation - * @prev_bw: The most recent bandwidth in MBps - */ -struct mbm_state { - u64 prev_bw_bytes; - u32 prev_bw; -}; - -/** * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s * return value. * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) @@ -401,24 +122,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -extern struct mutex rdtgroup_mutex; - -static inline const char *rdt_kn_name(const struct kernfs_node *kn) -{ - return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); -} - extern struct rdt_hw_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -extern struct dentry *debugfs_resctrl; -extern enum resctrl_event_id mba_mbps_default_event; - -static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) -{ - return rdt_resources_all[l].cdp_enabled; -} - -int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); @@ -455,99 +159,14 @@ union cpuid_0x10_x_edx { unsigned int full; }; -void rdt_last_cmd_clear(void); -void rdt_last_cmd_puts(const char *s); -__printf(1, 2) -void rdt_last_cmd_printf(const char *fmt, ...); - void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct 
seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm); -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); -int rdtgroup_tasks_assigned(struct rdtgroup *r); -int closids_supported(void); -void closid_free(int closid); -int alloc_rmid(u32 closid); -void free_rmid(u32 closid, u32 rmid); -int rdt_get_mon_l3_config(struct rdt_resource *r); -void resctrl_mon_resource_exit(void); -bool __init rdt_cpu_has(int flag); -void mon_event_count(void *info); -int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); -int __init resctrl_mon_resource_init(void); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, - unsigned long delay_ms, - int exclude_cpu); -void mbm_handle_overflow(struct work_struct *work); -void __init intel_rdt_mbm_apply_quirk(void); -bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu); -void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); -void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void resctrl_file_fflags_init(const char *config, unsigned long fflags); -void rdt_staged_configs_clear(void); -bool closid_allocated(unsigned int closid); -int resctrl_find_cleanest_closid(void); - -#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -#else -static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} -static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} - -static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - return false; -} +int rdt_get_mon_l3_config(struct rdt_resource *r); -static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - return false; -} +bool rdt_cpu_has(int flag); -static inline int rdt_pseudo_lock_init(void) { return 0; } -static inline void rdt_pseudo_lock_release(void) { } -static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} +void __init intel_rdt_mbm_apply_quirk(void); -static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } -#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 591b0b44d260..c261558276cd 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -18,63 +18,12 @@ #define pr_fmt(fmt) "resctrl: " fmt #include <linux/cpu.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include 
<linux/slab.h> +#include <linux/resctrl.h> #include <asm/cpu_device_id.h> #include <asm/msr.h> -#include <asm/resctrl.h> #include "internal.h" -#include "trace.h" - -/** - * struct rmid_entry - dirty tracking for all RMID. - * @closid: The CLOSID for this entry. - * @rmid: The RMID for this entry. - * @busy: The number of domains with cached data using this RMID. - * @list: Member of the rmid_free_lru list when busy == 0. - * - * Depending on the architecture the correct monitor is accessed using - * both @closid and @rmid, or @rmid only. - * - * Take the rdtgroup_mutex when accessing. - */ -struct rmid_entry { - u32 closid; - u32 rmid; - int busy; - struct list_head list; -}; - -/* - * @rmid_free_lru - A least recently used list of free RMIDs - * These RMIDs are guaranteed to have an occupancy less than the - * threshold occupancy - */ -static LIST_HEAD(rmid_free_lru); - -/* - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. - * Indexed by CLOSID. Protected by rdtgroup_mutex. - */ -static u32 *closid_num_dirty_rmid; - -/* - * @rmid_limbo_count - count of currently unused but (potentially) - * dirty RMIDs. - * This counts RMIDs that no one is currently using but that - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can - * change the threshold occupancy value. - */ -static unsigned int rmid_limbo_count; - -/* - * @rmid_entry - The entry in the limbo and free lists. - */ -static struct rmid_entry *rmid_ptrs; /* * Global boolean for rdt_monitor which is true if any @@ -87,23 +36,12 @@ bool rdt_mon_capable; */ unsigned int rdt_mon_features; -/* - * This is the threshold cache occupancy in bytes at which we will consider an - * RMID available for re-allocation. - */ -unsigned int resctrl_rmid_realloc_threshold; - -/* - * This is the maximum value for the reallocation threshold, in bytes. - */ -unsigned int resctrl_rmid_realloc_limit; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; /* - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * @@ -152,6 +90,7 @@ static const struct mbm_correction_factor_table { }; static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; + static u64 mbm_cf __read_mostly; static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) @@ -164,33 +103,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) } /* - * x86 and arm64 differ in their handling of monitoring. - * x86's RMID are independent numbers, there is only one source of traffic - * with an RMID value of '1'. - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID - * value is no longer unique. - * To account for this, resctrl uses an index. On x86 this is just the RMID, - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. - * - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code - * must accept an attempt to read every index. 
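A sketch of the index scheme this comment describes, with a compile-time switch between the x86 behaviour (index equals RMID) and an MPAM-like encoding that combines CLOSID and RMID. The 8-bit split is an arbitrary illustration, not MPAM's real field widths.

/* Build: cc -o rmid_idx_sketch rmid_idx_sketch.c */
#include <stdio.h>

#define MPAM_LIKE 1
#define RMID_BITS 8

static unsigned int idx_encode(unsigned int closid, unsigned int rmid)
{
#if MPAM_LIKE
	return (closid << RMID_BITS) | rmid;	/* PMG only qualifies a PARTID */
#else
	(void)closid;				/* x86: CLOSID and RMID are independent */
	return rmid;
#endif
}

static void idx_decode(unsigned int idx, unsigned int *closid, unsigned int *rmid)
{
#if MPAM_LIKE
	*closid = idx >> RMID_BITS;
	*rmid = idx & ((1u << RMID_BITS) - 1);
#else
	*closid = 0;		/* X86_RESCTRL_EMPTY_CLOSID in the real code */
	*rmid = idx;
#endif
}

int main(void)
{
	unsigned int c, r, idx = idx_encode(3, 7);

	idx_decode(idx, &c, &r);
	printf("idx=%u -> closid=%u rmid=%u\n", idx, c, r);
	return 0;
}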
- */ -static inline struct rmid_entry *__rmid_entry(u32 idx) -{ - struct rmid_entry *entry; - u32 closid, rmid; - - entry = &rmid_ptrs[idx]; - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - - WARN_ON_ONCE(entry->closid != closid); - WARN_ON_ONCE(entry->rmid != rmid); - - return entry; -} - -/* * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is * needed. The physical RMID is the same as the logical RMID. @@ -261,12 +173,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return &hw_dom->arch_mbm_total[rmid]; case QOS_L3_MBM_LOCAL_EVENT_ID: return &hw_dom->arch_mbm_local[rmid]; + default: + /* Never expect to get here */ + WARN_ON_ONCE(1); + return NULL; } - - /* Never expect to get here */ - WARN_ON_ONCE(1); - - return NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -347,769 +258,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return 0; } -static void limbo_release_entry(struct rmid_entry *entry) -{ - lockdep_assert_held(&rdtgroup_mutex); - - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]--; -} - -/* - * Check the RMIDs that are marked as busy for this domain. If the - * reported LLC occupancy is below the threshold clear the busy bit and - * decrement the count. If the busy count gets to zero on an RMID, we - * free the RMID - */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - struct rmid_entry *entry; - u32 idx, cur_idx = 1; - void *arch_mon_ctx; - bool rmid_dirty; - u64 val = 0; - - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); - if (IS_ERR(arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(arch_mon_ctx)); - return; - } - - /* - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that - * are marked as busy for occupancy < threshold. If the occupancy - * is less than the threshold decrement the busy counter of the - * RMID and move it to the free list when the counter reaches 0. - */ - for (;;) { - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); - if (idx >= idx_limit) - break; - - entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, - arch_mon_ctx)) { - rmid_dirty = true; - } else { - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); - - /* - * x86's CLOSID and RMID are independent numbers, so the entry's - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't - * used to select the configuration. It is thus necessary to track both - * CLOSID and RMID because there may be dependencies between them - * on some architectures. 
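A single-domain, user-space rendering of the __check_limbo() walk: an RMID freed while it still owned cache lines stays in limbo until its measured LLC occupancy drops below the re-allocation threshold. The kernel version also tracks how many domains still see each RMID as busy; that bookkeeping is dropped here.

/* Build: cc -o limbo_sketch limbo_sketch.c */
#include <stdbool.h>
#include <stdio.h>

#define NUM_RMIDS 8

/* Toy per-RMID state: still-cached bytes and whether the RMID is in limbo. */
static unsigned long long occupancy[NUM_RMIDS] = { 0, 0, 9000, 300, 0, 70000, 0, 0 };
static bool busy[NUM_RMIDS]                    = { 0, 0, 1,    1,   0, 1,     0, 0 };
static unsigned long long threshold = 1024;

static void check_limbo(bool force_free)
{
	for (int rmid = 1; rmid < NUM_RMIDS; rmid++) {	/* RMID 0 is reserved */
		if (!busy[rmid])
			continue;
		if (force_free || occupancy[rmid] < threshold) {
			busy[rmid] = false;
			printf("RMID %d released for re-use\n", rmid);
		}
	}
}

int main(void)
{
	check_limbo(false);	/* releases RMID 3 only (300 < 1024) */
	return 0;
}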
- */ - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); - } - - if (force_free || !rmid_dirty) { - clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) - limbo_release_entry(entry); - } - cur_idx = idx + 1; - } - - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); -} - -bool has_busy_rmid(struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; -} - -static struct rmid_entry *resctrl_find_free_rmid(u32 closid) -{ - struct rmid_entry *itr; - u32 itr_idx, cmp_idx; - - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); - - list_for_each_entry(itr, &rmid_free_lru, list) { - /* - * Get the index of this free RMID, and the index it would need - * to be if it were used with this CLOSID. - * If the CLOSID is irrelevant on this architecture, the two - * index values are always the same on every entry and thus the - * very first entry will be returned. - */ - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); - - if (itr_idx == cmp_idx) - return itr; - } - - return ERR_PTR(-ENOSPC); -} - -/** - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated - * RMID are clean, or the CLOSID that has - * the most clean RMID. - * - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID - * may not be able to allocate clean RMID. To avoid this the allocator will - * choose the CLOSID with the most clean RMID. - * - * When the CLOSID and RMID are independent numbers, the first free CLOSID will - * be returned. - */ -int resctrl_find_cleanest_closid(void) -{ - u32 cleanest_closid = ~0; - int i = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - return -EIO; - - for (i = 0; i < closids_supported(); i++) { - int num_dirty; - - if (closid_allocated(i)) - continue; - - num_dirty = closid_num_dirty_rmid[i]; - if (num_dirty == 0) - return i; - - if (cleanest_closid == ~0) - cleanest_closid = i; - - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) - cleanest_closid = i; - } - - if (cleanest_closid == ~0) - return -ENOSPC; - - return cleanest_closid; -} - -/* - * For MPAM the RMID value is not unique, and has to be considered with - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which - * allows all domains to be managed by a single free list. - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. - */ -int alloc_rmid(u32 closid) -{ - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - entry = resctrl_find_free_rmid(closid); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - list_del(&entry->list); - return entry->rmid; -} - -static void add_rmid_to_limbo(struct rmid_entry *entry) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - u32 idx; - - lockdep_assert_held(&rdtgroup_mutex); - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); - - entry->busy = 0; - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* - * For the first limbo RMID in the domain, - * setup up the limbo worker. 
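resctrl_find_cleanest_closid(), removed above, picks the free CLOSID whose RMIDs are all clean, or failing that the one with the fewest dirty RMIDs; a compact stand-alone version of that search:

/* Build: cc -o cleanest_closid_sketch cleanest_closid_sketch.c */
#include <stdbool.h>
#include <stdio.h>

#define NUM_CLOSIDS 4

/*
 * On MPAM the RMIDs (PMGs) hang off a CLOSID, so a freshly allocated
 * CLOSID should be the free one with no dirty (still-in-limbo) RMIDs,
 * or the one with the fewest of them.
 */
static int find_cleanest_closid(const bool *allocated, const int *num_dirty)
{
	int cleanest = -1;

	for (int i = 0; i < NUM_CLOSIDS; i++) {
		if (allocated[i])
			continue;
		if (num_dirty[i] == 0)
			return i;		/* perfectly clean: take it */
		if (cleanest < 0 || num_dirty[i] < num_dirty[cleanest])
			cleanest = i;
	}
	return cleanest;			/* -1 if no free CLOSID at all */
}

int main(void)
{
	bool allocated[NUM_CLOSIDS] = { true, false, false, true };
	int num_dirty[NUM_CLOSIDS]  = { 0,    4,     2,     0    };

	printf("cleanest free CLOSID: %d\n",
	       find_cleanest_closid(allocated, num_dirty));	/* 2 */
	return 0;
}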
- */ - if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, - RESCTRL_PICK_ANY_CPU); - set_bit(idx, d->rmid_busy_llc); - entry->busy++; - } - - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; -} - -void free_rmid(u32 closid, u32 rmid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - /* - * Do not allow the default rmid to be free'd. Comparing by index - * allows architectures that ignore the closid parameter to avoid an - * unnecessary check. - */ - if (!resctrl_arch_mon_capable() || - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID)) - return; - - entry = __rmid_entry(idx); - - if (resctrl_arch_is_llc_occupancy_enabled()) - add_rmid_to_limbo(entry); - else - list_add_tail(&entry->list, &rmid_free_lru); -} - -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, - u32 rmid, enum resctrl_event_id evtid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: - return NULL; - } -} - -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - int cpu = smp_processor_id(); - struct rdt_mon_domain *d; - struct mbm_state *m; - int err, ret; - u64 tval = 0; - - if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (m) - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; - - rr->val += tval; - - return 0; - } - - /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) - return -EINVAL; - - /* - * Legacy files must report the sum of an event across all - * domains that share the same L3 cache instance. - * Report success if a read from any domain succeeds, -EINVAL - * (translated to "Unavailable" for user space) if reading from - * all domains fail for any reason. - */ - ret = -EINVAL; - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) - continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (!err) { - rr->val += tval; - ret = 0; - } - } - - if (ret) - rr->err = ret; - - return ret; -} - -/* - * mbm_bw_count() - Update bw count from values previously read by - * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. - * @rr: The struct rmid_read populated by __mon_event_count(). - * - * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. The chunks value previously read by - * __mon_event_count() is compared with the chunks value from the previous - * invocation. This must be called once per second to maintain values in MBps. 
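The mbm_bw_count() kernel-doc above spells out the arithmetic: the overflow worker samples the byte counter once per second, so the bandwidth in MBps is simply the delta from the previous sample divided by 1 MiB. A self-contained restatement, assuming exactly 1-second sampling:

#include <stdint.h>

#define SZ_1M (1024u * 1024u)

struct mbm_sample {
        uint64_t prev_bytes;    /* counter value at the previous 1s tick */
        uint64_t mbps;          /* most recent bandwidth estimate */
};

/* Call once per second with the current mbm_total/mbm_local byte count. */
static void update_bw(struct mbm_sample *s, uint64_t cur_bytes)
{
        uint64_t delta = cur_bytes - s->prev_bytes;

        s->prev_bytes = cur_bytes;
        s->mbps = delta / SZ_1M;        /* bytes per second -> MiB per second */
}

int main(void)
{
        struct mbm_sample s = { 0 };

        update_bw(&s, 0);               /* first sample primes prev_bytes */
        update_bw(&s, 300 * SZ_1M);     /* one second later: ~300 MBps */
        return s.mbps == 300 ? 0 : 1;
}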
- */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - u64 cur_bw, bytes, cur_bytes; - struct mbm_state *m; - - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (WARN_ON_ONCE(!m)) - return; - - cur_bytes = rr->val; - bytes = cur_bytes - m->prev_bw_bytes; - m->prev_bw_bytes = cur_bytes; - - cur_bw = bytes / SZ_1M; - - m->prev_bw = cur_bw; -} - -/* - * This is scheduled by mon_event_read() to read the CQM/MBM counters - * on a domain. - */ -void mon_event_count(void *info) -{ - struct rdtgroup *rdtgrp, *entry; - struct rmid_read *rr = info; - struct list_head *head; - int ret; - - rdtgrp = rr->rgrp; - - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); - - /* - * For Ctrl groups read data from child monitor groups and - * add them together. Count events which are read successfully. - * Discard the rmid_read's reporting errors. - */ - head = &rdtgrp->mon.crdtgrp_list; - - if (rdtgrp->type == RDTCTRL_GROUP) { - list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) - ret = 0; - } - } - - /* - * __mon_event_count() calls for newly created monitor groups may - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. - * Discard error if any of the monitor event reads succeeded. - */ - if (ret == 0) - rr->err = 0; -} - -static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_ctrl_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -/* - * Feedback loop for MBA software controller (mba_sc) - * - * mba_sc is a feedback loop where we periodically read MBM counters and - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so - * that: - * - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) - * - * This uses the MBM counters to measure the bandwidth and MBA throttle - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the - * fact that resctrl rdtgroups have both monitoring and control. - * - * The frequency of the checks is 1s and we just tag along the MBM overflow - * timer. Having 1s interval makes the calculation of bandwidth simpler. - * - * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid unnecessarily restricting - * the L2 <-> L3 traffic. - * - * Since MBA controls the L2 external bandwidth where as MBM measures the - * L3 external bandwidth the following sequence could lead to such a - * situation. - * - * Consider an rdtgroup which had high L3 <-> memory traffic in initial - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but - * after some time rdtgroup has mostly L2 <-> L3 traffic. - * - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its - * throttle MSRs already have low percentage values. To avoid - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
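mon_event_count() above reports, for a CTRL_MON group, its own count plus the counts of its child MON groups, and propagates an error only if every one of those reads failed. A compact sketch of that accumulate-and-discard-errors pattern; read_group() and the demo value are placeholders, not kernel interfaces:

#include <stddef.h>
#include <stdint.h>

struct group_read {
        uint64_t val;   /* running sum across parent + children */
        int err;        /* last error, cleared if any read succeeds */
};

/* Hypothetical per-group counter read: returns 0 on success, <0 on error. */
static int read_group(int group_id, uint64_t *val)
{
        (void)group_id;
        *val = 100;     /* demo value */
        return 0;
}

static void count_ctrl_group(struct group_read *rr, const int *children, size_t n)
{
        uint64_t tmp;
        int ret;

        ret = read_group(0, &tmp);      /* the CTRL_MON group itself */
        if (!ret)
                rr->val += tmp;

        for (size_t i = 0; i < n; i++) {
                if (read_group(children[i], &tmp) == 0) {
                        rr->val += tmp;
                        ret = 0;        /* one success is enough to report a value */
                }
        }
        rr->err = ret;
}

int main(void)
{
        struct group_read rr = { 0 };
        int children[] = { 1, 2 };

        count_ctrl_group(&rr, children, 2);
        return rr.err;  /* 0: at least one read succeeded */
}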
- */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) -{ - u32 closid, rmid, cur_msr_val, new_msr_val; - struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_ctrl_domain *dom_mba; - enum resctrl_event_id evt_id; - struct rdt_resource *r_mba; - struct list_head *head; - struct rdtgroup *entry; - u32 cur_bw, user_bw; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - evt_id = rgrp->mba_mbps_event; - - closid = rgrp->closid; - rmid = rgrp->mon.rmid; - pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); - if (WARN_ON_ONCE(!pmbm_data)) - return; - - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); - if (!dom_mba) { - pr_warn_once("Failure to get domain for MBA update\n"); - return; - } - - cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; - - /* MBA resource doesn't support CDP */ - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); - - /* - * For Ctrl groups read data from child monitor groups. - */ - head = &rgrp->mon.crdtgrp_list; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); - if (WARN_ON_ONCE(!cmbm_data)) - return; - cur_bw += cmbm_data->prev_bw; - } - - /* - * Scale up/down the bandwidth linearly for the ctrl group. The - * bandwidth step is the bandwidth granularity specified by the - * hardware. - * Always increase throttling if current bandwidth is above the - * target set by user. - * But avoid thrashing up and down on every poll by checking - * whether a decrease in throttling is likely to push the group - * back over target. E.g. if currently throttling to 30% of bandwidth - * on a system with 10% granularity steps, check whether moving to - * 40% would go past the limit by multiplying current bandwidth by - * "(30 + 10) / 30". - */ - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; - } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; - } else { - return; - } - - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); -} - -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) -{ - struct rmid_read rr = {0}; - - rr.r = r; - rr.d = d; - rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - /* - * If the software controller is enabled, compute the - * bandwidth for this event id. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); -} - -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) -{ - /* - * This is protected from concurrent reads from user as both - * the user and overflow handler hold the global mutex. - */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Handler to scan the limbo list and move the RMIDs - * to free list whose occupancy < threshold_occupancy. 
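update_mba_bw() above is the feedback step itself: throttle one granularity step harder while the measured bandwidth exceeds the user's target, and relax one step only when the projected bandwidth after relaxing still stays under the target (the "(30 + 10) / 30" check in the comment). A standalone sketch of that decision, using the granularity step in the overshoot check as the comment describes; MAX_BW, MIN_BW and GRAN stand in for the resource's membw parameters:

#include <stdint.h>

#define MAX_BW 100u     /* MAX_MBA_BW: 100% means unthrottled */
#define MIN_BW 10u      /* smallest throttle value supported  */
#define GRAN   10u      /* bandwidth granularity step         */

/*
 * New throttle percentage given the current one, the measured bandwidth and
 * the user's target (both in MBps): always tighten when over target, loosen
 * only when one step of loosening is projected to stay under the target.
 */
static uint32_t mba_sc_step(uint32_t cur_pct, uint64_t cur_bw, uint64_t user_bw)
{
        if (cur_pct > MIN_BW && user_bw < cur_bw)
                return cur_pct - GRAN;

        if (cur_pct < MAX_BW && user_bw > cur_bw * (cur_pct + GRAN) / cur_pct)
                return cur_pct + GRAN;

        return cur_pct; /* already in balance - leave the MSR alone */
}

int main(void)
{
        /* Throttled to 30%, measuring 600 MBps against a 1000 MBps target:
         * 600 * 40/30 = 800 < 1000, so stepping up to 40% is safe. */
        return mba_sc_step(30, 600, 1000) == 40 ? 0 : 1;
}

The asymmetry (always tighten, cautiously loosen) is what keeps the loop from thrashing up and down on every poll, as the removed comment notes.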
- */ -void cqm_handle_limbo(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); - - __check_limbo(d, false); - - if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, - delay); - } - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this - * domain. - * @dom: The domain the limbo handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->cqm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); -} - -void mbm_handle_overflow(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); - struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; - struct list_head *head; - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* - * If the filesystem has been unmounted this work no longer needs to - * run. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - goto out_unlock; - - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); - - if (is_mba_sc(NULL)) - update_mba_bw(prgrp, d); - } - - /* - * Re-check for housekeeping CPUs. This allows the overflow handler to - * move off a nohz_full CPU quickly. - */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this - * domain. - * @dom: The domain the overflow handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - /* - * When a domain comes online there is no guarantee the filesystem is - * mounted. If not, there is no need to catch counter overflow. 
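mbm_handle_overflow() above is rescheduled once per second because the raw MBM counters have limited width; sampling before a full wrap lets the architecture code compute each delta modulo the counter width. The real width is enumerated by the hardware (mbm_width_offset above it) and is not part of this hunk, so the 24-bit value below is purely an assumption of the sketch:

#include <stdint.h>

#define MBM_WIDTH 24    /* assumed counter width; the real width is enumerated */

/* Difference of two raw counter reads, tolerating a single wrap-around. */
static uint64_t mbm_delta(uint64_t prev, uint64_t cur)
{
        unsigned int shift = 64 - MBM_WIDTH;

        /* Shifting both values up makes the subtraction wrap naturally. */
        return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
        /* Counter wrapped from near its maximum back to a small value. */
        return mbm_delta(0xFFFFF0, 0x000010) == 0x20 ? 0 : 1;
}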
- */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - return; - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->mbm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); -} - -static int dom_data_init(struct rdt_resource *r) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rmid_entry *entry = NULL; - int err = 0, i; - u32 idx; - - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } - - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < idx_limit; i++) { - entry = &rmid_ptrs[i]; - INIT_LIST_HEAD(&entry->list); - - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); - list_add_tail(&entry->list, &rmid_free_lru); - } - - /* - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and - * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). - */ - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID); - entry = __rmid_entry(idx); - list_del(&entry->list); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -static void dom_data_exit(struct rdt_resource *r) -{ - mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - - kfree(rmid_ptrs); - rmid_ptrs = NULL; - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - -/* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. - */ -static void l3_mon_evt_init(struct rdt_resource *r) -{ - INIT_LIST_HEAD(&r->evt_list); - - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); -} - /* * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 * which indicates that RMIDs are configured in legacy mode. @@ -1193,51 +341,6 @@ static __init int snc_get_config(void) return ret; } -/** - * resctrl_mon_resource_init() - Initialise global monitoring structures. - * - * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. 
the rmid_ptrs[] used for the limbo and free lists. - * Called once during boot after the struct rdt_resource's have been configured - * but before the filesystem is mounted. - * Resctrl's cpuhp callbacks may be called before this point to bring a domain - * online. - * - * Returns 0 for success, or -ENOMEM. - */ -int __init resctrl_mon_resource_init(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; - - if (!r->mon_capable) - return 0; - - ret = dom_data_init(r); - if (ret) - return ret; - - l3_mon_evt_init(r); - - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; - resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; - resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - - if (resctrl_arch_is_mbm_local_enabled()) - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; - - return 0; -} - int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -1285,13 +388,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } -void resctrl_mon_resource_exit(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - - dom_data_exit(r); -} - void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 1190c48a16b2..de580eca3363 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,19 +11,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cacheflush.h> #include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/debugfs.h> -#include <linux/kthread.h> -#include <linux/mman.h> #include <linux/perf_event.h> #include <linux/pm_qos.h> -#include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/resctrl.h> -#include <asm/cacheflush.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> #include <asm/perf_event.h> #include <asm/msr.h> @@ -31,7 +25,8 @@ #include "internal.h" #define CREATE_TRACE_POINTS -#include "trace.h" + +#include "pseudo_lock_trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -39,29 +34,6 @@ */ static u64 prefetch_disable_bits; -/* - * Major number assigned to and shared by all devices exposing - * pseudo-locked regions. - */ -static unsigned int pseudo_lock_major; -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); - -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - guard(mutex)(&rdtgroup_mutex); - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); -} - -static const struct class pseudo_lock_class = { - .name = "pseudo_lock", - .devnode = pseudo_lock_devnode, -}; - /** * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported * platforms @@ -123,298 +95,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void) } /** - * pseudo_lock_minor_get - Obtain available minor number - * @minor: Pointer to where new minor number will be stored - * - * A bitmask is used to track available minor numbers. 
Here the next free - * minor number is marked as unavailable and returned. - * - * Return: 0 on success, <0 on failure. - */ -static int pseudo_lock_minor_get(unsigned int *minor) -{ - unsigned long first_bit; - - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); - - if (first_bit == MINORBITS) - return -ENOSPC; - - __clear_bit(first_bit, &pseudo_lock_minor_avail); - *minor = first_bit; - - return 0; -} - -/** - * pseudo_lock_minor_release - Return minor number to available - * @minor: The minor number made available - */ -static void pseudo_lock_minor_release(unsigned int minor) -{ - __set_bit(minor, &pseudo_lock_minor_avail); -} - -/** - * region_find_by_minor - Locate a pseudo-lock region by inode minor number - * @minor: The minor number of the device representing pseudo-locked region - * - * When the character device is accessed we need to determine which - * pseudo-locked region it belongs to. This is done by matching the minor - * number of the device to the pseudo-locked region it belongs. - * - * Minor numbers are assigned at the time a pseudo-locked region is associated - * with a cache instance. - * - * Return: On success return pointer to resource group owning the pseudo-locked - * region, NULL on failure. - */ -static struct rdtgroup *region_find_by_minor(unsigned int minor) -{ - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { - rdtgrp_match = rdtgrp; - break; - } - } - return rdtgrp_match; -} - -/** - * struct pseudo_lock_pm_req - A power management QoS request list entry - * @list: Entry within the @pm_reqs list for a pseudo-locked region - * @req: PM QoS request - */ -struct pseudo_lock_pm_req { - struct list_head list; - struct dev_pm_qos_request req; -}; - -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req, *next; - - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { - dev_pm_qos_remove_request(&pm_req->req); - list_del(&pm_req->list); - kfree(pm_req); - } -} - -/** - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 - * @plr: Pseudo-locked region - * - * To prevent the cache from being affected by power management entering - * C6 has to be avoided. This is accomplished by requesting a latency - * requirement lower than lowest C6 exit latency of all supported - * platforms as found in the cpuidle state tables in the intel_idle driver. - * At this time it is possible to do so with a single latency requirement - * for all supported platforms. - * - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, - * the ACPI latencies need to be considered while keeping in mind that C2 - * may be set to map to deeper sleep states. In this case the latency - * requirement needs to prevent entering C2 also. 
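pseudo_lock_minor_get() and pseudo_lock_minor_release() above manage character-device minor numbers with a single bitmask: allocation clears the lowest set (free) bit, release sets it again. The same idea in user-space C, with __builtin_ctzl() playing the role of find_first_bit():

#include <errno.h>

/* One bit per available minor number; all minors start out free. */
static unsigned long minor_avail = ~0UL;

static int minor_get(unsigned int *minor)
{
        if (!minor_avail)
                return -ENOSPC;

        /* __builtin_ctzl() finds the lowest set, i.e. free, bit. */
        *minor = (unsigned int)__builtin_ctzl(minor_avail);
        minor_avail &= ~(1UL << *minor);
        return 0;
}

static void minor_release(unsigned int minor)
{
        minor_avail |= 1UL << minor;
}

int main(void)
{
        unsigned int a, b;

        minor_get(&a);          /* a == 0 */
        minor_get(&b);          /* b == 1 */
        minor_release(a);       /* 0 becomes available again */
        return minor_get(&a) == 0 && a == 0 ? 0 : 1;
}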
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req; - int cpu; - int ret; - - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); - if (!pm_req) { - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); - ret = -ENOMEM; - goto out_err; - } - ret = dev_pm_qos_add_request(get_cpu_device(cpu), - &pm_req->req, - DEV_PM_QOS_RESUME_LATENCY, - 30); - if (ret < 0) { - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", - cpu); - kfree(pm_req); - ret = -1; - goto out_err; - } - list_add(&pm_req->list, &plr->pm_reqs); - } - - return 0; - -out_err: - pseudo_lock_cstates_relax(plr); - return ret; -} - -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->s = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - -/** - * pseudo_lock_region_init - Initialize pseudo-lock region information - * @plr: pseudo-lock region - * - * Called after user provided a schemata to be pseudo-locked. From the - * schemata the &struct pseudo_lock_region is on entry already initialized - * with the resource, domain, and capacity bitmask. Here the information - * required for pseudo-locking is deduced from this data and &struct - * pseudo_lock_region initialized further. This information includes: - * - size in bytes of the region to be pseudo-locked - * - cache line size to know the stride with which data needs to be accessed - * to be pseudo-locked - * - a cpu associated with the cache instance on which the pseudo-locking - * flow can be executed - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_init(struct pseudo_lock_region *plr) -{ - enum resctrl_scope scope = plr->s->res->ctrl_scope; - struct cacheinfo *ci; - int ret; - - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) - return -ENODEV; - - /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); - - if (!cpu_online(plr->cpu)) { - rdt_last_cmd_printf("CPU %u associated with cache not online\n", - plr->cpu); - ret = -ENODEV; - goto out_region; - } - - ci = get_cpu_cacheinfo_level(plr->cpu, scope); - if (ci) { - plr->line_size = ci->coherency_line_size; - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - return 0; - } - - ret = -1; - rdt_last_cmd_puts("Unable to determine cache line size\n"); -out_region: - pseudo_lock_region_clear(plr); - return ret; -} - -/** - * pseudo_lock_init - Initialize a pseudo-lock region - * @rdtgrp: resource group to which new pseudo-locked region will belong - * - * A pseudo-locked region is associated with a resource group. When this - * association is created the pseudo-locked region is initialized. The - * details of the pseudo-locked region are not known at this time so only - * allocation is done and association established. 
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_init(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr; - - plr = kzalloc(sizeof(*plr), GFP_KERNEL); - if (!plr) - return -ENOMEM; - - init_waitqueue_head(&plr->lock_thread_wq); - INIT_LIST_HEAD(&plr->pm_reqs); - rdtgrp->plr = plr; - return 0; -} - -/** - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked - * @plr: pseudo-lock region - * - * Initialize the details required to set up the pseudo-locked region and - * allocate the contiguous memory that will be pseudo-locked to the cache. - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) -{ - int ret; - - ret = pseudo_lock_region_init(plr); - if (ret < 0) - return ret; - - /* - * We do not yet support contiguous regions larger than - * KMALLOC_MAX_SIZE. - */ - if (plr->size > KMALLOC_MAX_SIZE) { - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); - ret = -E2BIG; - goto out_region; - } - - plr->kmem = kzalloc(plr->size, GFP_KERNEL); - if (!plr->kmem) { - rdt_last_cmd_puts("Unable to allocate memory\n"); - ret = -ENOMEM; - goto out_region; - } - - ret = 0; - goto out; -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * pseudo_lock_free - Free a pseudo-locked region - * @rdtgrp: resource group to which pseudo-locked region belonged - * - * The pseudo-locked region's resources have already been released, or not - * yet created at this point. Now it can be freed and disassociated from the - * resource group. - * - * Return: void - */ -static void pseudo_lock_free(struct rdtgroup *rdtgrp) -{ - pseudo_lock_region_clear(rdtgrp->plr); - kfree(rdtgrp->plr); - rdtgrp->plr = NULL; -} - -/** * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache * @_plr: the pseudo-lock region descriptor * @@ -544,340 +224,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) } /** - * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @rdtgrp: resource group being queried - * - * Return: 1 if monitor groups have been created for this resource - * group, 0 otherwise. - */ -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) -{ - return !list_empty(&rdtgrp->mon.crdtgrp_list); -} - -/** - * rdtgroup_locksetup_user_restrict - Restrict user access to group - * @rdtgrp: resource group needing access restricted - * - * A resource group used for cache pseudo-locking cannot have cpus or tasks - * assigned to it. This is communicated to the user by restricting access - * to all the files that can be used to make such changes. - * - * Permissions restored with rdtgroup_locksetup_user_restore() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restriction of access an attempt will be made to restore permissions but - * the state of the mode of these files will be uncertain when a failure - * occurs. 
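resctrl_arch_pseudo_lock_fn(), referenced as context above but left in place by this diff, populates the locked cache portion by reading the kzalloc()'d buffer one cache line at a time while hardware prefetchers are disabled. The user-space sketch below illustrates only the access pattern (no prefetcher control, no CAT programming); the 64-byte line size and 256 KiB region are assumptions:

#include <stdlib.h>

#define LINE_SIZE 64    /* plr->line_size on most x86 parts */

/*
 * Touch one byte per cache line so every line of the buffer is brought into
 * the cache. The volatile read keeps the compiler from removing the loop.
 */
static void touch_region(const unsigned char *mem, size_t size)
{
        for (size_t i = 0; i < size; i += LINE_SIZE) {
                unsigned char v = *(const volatile unsigned char *)(mem + i);
                (void)v;
        }
}

int main(void)
{
        size_t size = 256 * 1024;       /* e.g. a 256 KiB cache portion */
        unsigned char *mem = calloc(1, size);

        if (!mem)
                return 1;
        touch_region(mem, size);
        free(mem);
        return 0;
}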
- */ -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); -err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); -err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); -out: - return ret; -} - -/** - * rdtgroup_locksetup_user_restore - Restore user access to group - * @rdtgrp: resource group needing access restored - * - * Restore all file access previously removed using - * rdtgroup_locksetup_user_restrict() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restoration of access an attempt will be made to restrict permissions - * again but the state of the mode of these files will be uncertain when - * a failure occurs. - */ -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); -err_cpus: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); -err_tasks: - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); -out: - return ret; -} - -/** - * rdtgroup_locksetup_enter - Resource group enters locksetup mode - * @rdtgrp: resource group requested to enter locksetup mode - * - * A resource group enters locksetup mode to reflect that it would be used - * to represent a pseudo-locked region and is in the process of being set - * up to do so. A resource group used for a pseudo-locked region would - * lose the closid associated with it so we cannot allow it to have any - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the - * future. Monitoring of a pseudo-locked region is not allowed either. - * - * The above and more restrictions on a pseudo-locked region are checked - * for and enforced before the resource group enters the locksetup mode. - * - * Returns: 0 if the resource group successfully entered locksetup mode, <0 - * on failure. On failure the last_cmd_status buffer is updated with text to - * communicate details of failure to the user. - */ -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - int ret; - - /* - * The default resource group can neither be removed nor lose the - * default closid associated with it. - */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); - return -EINVAL; - } - - /* - * Cache Pseudo-locking not supported when CDP is enabled. - * - * Some things to consider if you would like to enable this - * support (using L3 CDP as example): - * - When CDP is enabled two separate resources are exposed, - * L3DATA and L3CODE, but they are actually on the same cache. 
- * The implication for pseudo-locking is that if a - * pseudo-locked region is created on a domain of one - * resource (eg. L3CODE), then a pseudo-locked region cannot - * be created on that same domain of the other resource - * (eg. L3DATA). This is because the creation of a - * pseudo-locked region involves a call to wbinvd that will - * affect all cache allocations on particular domain. - * - Considering the previous, it may be possible to only - * expose one of the CDP resources to pseudo-locking and - * hide the other. For example, we could consider to only - * expose L3DATA and since the L3 cache is unified it is - * still possible to place instructions there are execute it. - * - If only one region is exposed to pseudo-locking we should - * still keep in mind that availability of a portion of cache - * for pseudo-locking should take into account both resources. - * Similarly, if a pseudo-locked region is created in one - * resource, the portion of cache used by it should be made - * unavailable to all future allocations from both resources. - */ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { - rdt_last_cmd_puts("CDP enabled\n"); - return -EINVAL; - } - - /* - * Not knowing the bits to disable prefetching implies that this - * platform does not support Cache Pseudo-Locking. - */ - if (resctrl_arch_get_prefetch_disable_bits() == 0) { - rdt_last_cmd_puts("Pseudo-locking not supported\n"); - return -EINVAL; - } - - if (rdtgroup_monitor_in_progress(rdtgrp)) { - rdt_last_cmd_puts("Monitoring in progress\n"); - return -EINVAL; - } - - if (rdtgroup_tasks_assigned(rdtgrp)) { - rdt_last_cmd_puts("Tasks assigned to resource group\n"); - return -EINVAL; - } - - if (!cpumask_empty(&rdtgrp->cpu_mask)) { - rdt_last_cmd_puts("CPUs assigned to resource group\n"); - return -EINVAL; - } - - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); - return -EIO; - } - - ret = pseudo_lock_init(rdtgrp); - if (ret) { - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); - goto out_release; - } - - /* - * If this system is capable of monitoring a rmid would have been - * allocated when the control group was created. This is not needed - * anymore when this group would be used for pseudo-locking. This - * is safe to call on platforms not capable of monitoring. - */ - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - ret = 0; - goto out; - -out_release: - rdtgroup_locksetup_user_restore(rdtgrp); -out: - return ret; -} - -/** - * rdtgroup_locksetup_exit - resource group exist locksetup mode - * @rdtgrp: resource group - * - * When a resource group exits locksetup mode the earlier restrictions are - * lifted. - * - * Return: 0 on success, <0 on failure - */ -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - int ret; - - if (resctrl_arch_mon_capable()) { - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - } - - ret = rdtgroup_locksetup_user_restore(rdtgrp); - if (ret) { - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - pseudo_lock_free(rdtgrp); - return 0; -} - -/** - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked - * @d: RDT domain - * @cbm: CBM to test - * - * @d represents a cache instance and @cbm a capacity bitmask that is - * considered for it. Determine if @cbm overlaps with any existing - * pseudo-locked region on @d. 
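rdtgroup_cbm_overlaps_pseudo_locked(), whose kernel-doc appears above, boils down to a bitmap intersection between the proposed capacity bitmask and the locked region's CBM over the low cbm_len bits. The same test in plain C:

#include <stdbool.h>

/* True if the two capacity bitmasks share any of the low cbm_len bits. */
static bool cbm_overlaps(unsigned long cbm_a, unsigned long cbm_b,
                         unsigned int cbm_len)
{
        unsigned long mask = (cbm_len >= 64) ? ~0UL : (1UL << cbm_len) - 1;

        return (cbm_a & cbm_b & mask) != 0;
}

int main(void)
{
        /* 0x00f0 and 0x0f00 are disjoint; 0x00f0 and 0x01e0 overlap. */
        return (!cbm_overlaps(0x00f0, 0x0f00, 20) &&
                 cbm_overlaps(0x00f0, 0x01e0, 20)) ? 0 : 1;
}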
- * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: true if @cbm overlaps with pseudo-locked region on @d, false - * otherwise. - */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int cbm_len; - unsigned long cbm_b; - - if (d->plr) { - cbm_len = d->plr->s->res->cache.cbm_len; - cbm_b = d->plr->cbm; - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) - return true; - } - return false; -} - -/** - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy - * @d: RDT domain under test - * - * The setup of a pseudo-locked region affects all cache instances within - * the hierarchy of the region. It is thus essential to know if any - * pseudo-locked regions exist within a cache hierarchy to prevent any - * attempts to create new pseudo-locked regions in the same hierarchy. - * - * Return: true if a pseudo-locked region exists in the hierarchy of @d or - * if it is not possible to test due to memory allocation issue, - * false otherwise. - */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - struct rdt_ctrl_domain *d_i; - cpumask_var_t cpu_with_psl; - struct rdt_resource *r; - bool ret = false; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) - return true; - - /* - * First determine which cpus have pseudo-locked regions - * associated with them. - */ - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { - if (d_i->plr) - cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->hdr.cpu_mask); - } - } - - /* - * Next test if new pseudo-locked region would intersect with - * existing region. - */ - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) - ret = true; - - free_cpumask_var(cpu_with_psl); - return ret; -} - -/** * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read * pseudo-locked memory * @_plr: pseudo-lock region to measure @@ -1169,433 +515,3 @@ out: wake_up_interruptible(&plr->lock_thread_wq); return 0; } - -/** - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region - * @rdtgrp: Resource group to which the pseudo-locked region belongs. - * @sel: Selector of which measurement to perform on a pseudo-locked region. - * - * The measurement of latency to access a pseudo-locked region should be - * done from a cpu that is associated with that pseudo-locked region. - * Determine which cpu is associated with this region and start a thread on - * that cpu to perform the measurement, wait for that thread to complete. 
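rdtgroup_pseudo_locked_in_hierarchy() above ORs together the CPU masks of every domain that already holds a pseudo-locked region and then checks whether the candidate domain shares any of those CPUs. With CPU masks shrunk to a single uint64_t, the shape of the check is:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct domain {
        uint64_t cpu_mask;      /* one bit per CPU in this cache domain */
        bool has_psl;           /* a pseudo-locked region exists here */
};

static bool psl_in_hierarchy(const struct domain *all, size_t n,
                             uint64_t candidate_cpus)
{
        uint64_t cpus_with_psl = 0;

        /* Collect every CPU that can reach an existing pseudo-locked region. */
        for (size_t i = 0; i < n; i++)
                if (all[i].has_psl)
                        cpus_with_psl |= all[i].cpu_mask;

        /* Any shared CPU means the cache hierarchies overlap. */
        return (candidate_cpus & cpus_with_psl) != 0;
}

int main(void)
{
        struct domain doms[] = {
                { .cpu_mask = 0x0f, .has_psl = true  }, /* L2 domain, CPUs 0-3 */
                { .cpu_mask = 0xf0, .has_psl = false }, /* L2 domain, CPUs 4-7 */
        };

        /* An L3 domain spanning CPUs 0-7 overlaps the locked L2 domain. */
        return psl_in_hierarchy(doms, 2, 0xff) ? 0 : 1;
}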
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int cpu; - int ret = -1; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out; - } - - if (!plr->d) { - ret = -ENODEV; - goto out; - } - - plr->thread_done = 0; - cpu = cpumask_first(&plr->d->hdr.cpu_mask); - if (!cpu_online(cpu)) { - ret = -ENODEV; - goto out; - } - - plr->cpu = cpu; - - if (sel == 1) - thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 2) - thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 3) - thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else - goto out; - - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - goto out; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) - goto out; - - ret = 0; - -out: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -static ssize_t pseudo_lock_measure_trigger(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct rdtgroup *rdtgrp = file->private_data; - size_t buf_size; - char buf[32]; - int ret; - int sel; - - buf_size = min(count, (sizeof(buf) - 1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - ret = kstrtoint(buf, 10, &sel); - if (ret == 0) { - if (sel != 1 && sel != 2 && sel != 3) - return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; - ret = pseudo_lock_measure_cycles(rdtgrp, sel); - if (ret == 0) - ret = count; - debugfs_file_put(file->f_path.dentry); - } - - return ret; -} - -static const struct file_operations pseudo_measure_fops = { - .write = pseudo_lock_measure_trigger, - .open = simple_open, - .llseek = default_llseek, -}; - -/** - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region - * @rdtgrp: resource group to which pseudo-lock region belongs - * - * Called when a resource group in the pseudo-locksetup mode receives a - * valid schemata that should be pseudo-locked. Since the resource group is - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been - * allocated and initialized with the essential information. If a failure - * occurs the resource group remains in the pseudo-locksetup mode with the - * &struct pseudo_lock_region associated with it, but cleared from all - * information and ready for the user to re-attempt pseudo-locking by - * writing the schemata again. - * - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 - * on failure. Descriptive error will be written to last_cmd_status buffer. 
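pseudo_lock_measure_trigger() above accepts "1", "2" or "3" (memory latency, L2 residency, L3 residency) written to a group's pseudo_lock_measure debugfs file, with results reported through the resctrl tracepoints. A minimal user-space trigger; the debugfs mount point and the group name "p0" are assumptions of this sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Path layout follows debugfs_create_dir(kn_name, debugfs_resctrl). */
        const char *path = "/sys/kernel/debug/resctrl/p0/pseudo_lock_measure";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* "1" selects the cycle-latency measurement. */
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}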
- */ -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int new_minor; - struct device *dev; - char *kn_name __free(kfree) = NULL; - int ret; - - ret = pseudo_lock_region_alloc(plr); - if (ret < 0) - return ret; - - ret = pseudo_lock_cstates_constrain(plr); - if (ret < 0) { - ret = -EINVAL; - goto out_region; - } - kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); - if (!kn_name) { - ret = -ENOMEM; - goto out_cstates; - } - - plr->thread_done = 0; - - thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, - plr->cpu, "pseudo_lock/%u"); - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); - goto out_cstates; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) { - /* - * If the thread does not get on the CPU for whatever - * reason and the process which sets up the region is - * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will dereference - * the cleared, but not freed, plr struct resulting in an - * empty pseudo-locking loop. - */ - rdt_last_cmd_puts("Locking thread interrupted\n"); - goto out_cstates; - } - - ret = pseudo_lock_minor_get(&new_minor); - if (ret < 0) { - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); - goto out_cstates; - } - - /* - * Unlock access but do not release the reference. The - * pseudo-locked region will still be here on return. - * - * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_lock which is obtained in the - * device_create() and debugfs_create_dir() callpath below as well as - * before the mmap() callback is called. - */ - mutex_unlock(&rdtgroup_mutex); - - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - - dev = device_create(&pseudo_lock_class, NULL, - MKDEV(pseudo_lock_major, new_minor), - rdtgrp, "%s", kn_name); - - mutex_lock(&rdtgroup_mutex); - - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - rdt_last_cmd_printf("Failed to create character device: %d\n", - ret); - goto out_debugfs; - } - - /* We released the mutex - check if group was removed while we did so */ - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out_device; - } - - plr->minor = new_minor; - - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; - closid_free(rdtgrp->closid); - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); - - ret = 0; - goto out; - -out_device: - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_debugfs: - debugfs_remove_recursive(plr->debugfs_dir); - pseudo_lock_minor_release(new_minor); -out_cstates: - pseudo_lock_cstates_relax(plr); -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region - * @rdtgrp: resource group to which the pseudo-locked region belongs - * - * The removal of a pseudo-locked region can be initiated when the resource - * group is removed from user space via a "rmdir" from userspace or the - * unmount of the resctrl filesystem. 
On removal the resource group does - * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus asymmetry with the creation where the - * &struct pseudo_lock_region is removed here while it was not created in - * rdtgroup_pseudo_lock_create(). - * - * Return: void - */ -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * Default group cannot be a pseudo-locked region so we can - * free closid here. - */ - closid_free(rdtgrp->closid); - goto free; - } - - pseudo_lock_cstates_relax(plr); - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); - pseudo_lock_minor_release(plr->minor); - -free: - pseudo_lock_free(rdtgrp); -} - -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = region_find_by_minor(iminor(inode)); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - filp->private_data = rdtgrp; - atomic_inc(&rdtgrp->waitcount); - /* Perform a non-seekable open - llseek is not supported */ - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - - mutex_unlock(&rdtgroup_mutex); - - return 0; -} - -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - filp->private_data = NULL; - atomic_dec(&rdtgrp->waitcount); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) -{ - /* Not supported */ - return -EINVAL; -} - -static const struct vm_operations_struct pseudo_mmap_ops = { - .mremap = pseudo_lock_dev_mremap, -}; - -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - struct pseudo_lock_region *plr; - struct rdtgroup *rdtgrp; - unsigned long physical; - unsigned long psize; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - plr = rdtgrp->plr; - - if (!plr->d) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - /* - * Task is required to run with affinity to the cpus associated - * with the pseudo-locked region. If this is not the case the task - * may be scheduled elsewhere and invalidate entries in the - * pseudo-locked region. - */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - physical = __pa(plr->kmem) >> PAGE_SHIFT; - psize = plr->size - off; - - if (off > plr->size) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - /* - * Ensure changes are carried directly to the memory being mapped, - * do not allow copy-on-write mapping. 
- */ - if (!(vma->vm_flags & VM_SHARED)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - if (vsize > psize) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - memset(plr->kmem + off, 0, vsize); - - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static const struct file_operations pseudo_lock_dev_fops = { - .owner = THIS_MODULE, - .read = NULL, - .write = NULL, - .open = pseudo_lock_dev_open, - .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, -}; - -int rdt_pseudo_lock_init(void) -{ - int ret; - - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); - if (ret < 0) - return ret; - - pseudo_lock_major = ret; - - ret = class_register(&pseudo_lock_class); - if (ret) { - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - return ret; - } - - return 0; -} - -void rdt_pseudo_lock_release(void) -{ - class_unregister(&pseudo_lock_class); - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - pseudo_lock_major = 0; -} diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h index 2a506316b303..7c8aef08010f 100644 --- a/arch/x86/kernel/cpu/resctrl/trace.h +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RESCTRL_H +#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H #include <linux/tracepoint.h> @@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -TRACE_EVENT(mon_llc_occupancy_limbo, - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), - TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), - TP_STRUCT__entry(__field(u32, ctrl_hw_id) - __field(u32, mon_hw_id) - __field(int, domain_id) - __field(u64, llc_occupancy_bytes)), - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; - __entry->mon_hw_id = mon_hw_id; - __entry->domain_id = domain_id; - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, - __entry->llc_occupancy_bytes) - ); - -#endif /* _TRACE_RESCTRL_H */ +#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace + +#define TRACE_INCLUDE_FILE pseudo_lock_trace + #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index c85ace29ea3a..885026468440 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/resctrl.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> @@ -29,341 +30,16 @@ #include <uapi/linux/magic.h> #include <asm/msr.h> -#include <asm/resctrl.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - -/* Mutex to protect rdtgroup access. 
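The pseudo_lock_dev_* code removed above exposes each locked region as /dev/pseudo_lock/<group> (per the devnode callback), and its mmap handler requires a MAP_SHARED mapping, an in-bounds size, and a task affinity confined to the region's CPUs. A user-space consumer therefore looks roughly like the sketch below; the group name "p0", CPU 2 and the 256 KiB size are assumptions:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t size = 256 * 1024;       /* must not exceed the locked region */
        cpu_set_t cpus;

        /* Run only on a CPU associated with the pseudo-locked cache domain. */
        CPU_ZERO(&cpus);
        CPU_SET(2, &cpus);
        if (sched_setaffinity(0, sizeof(cpus), &cpus))
                return 1;

        int fd = open("/dev/pseudo_lock/p0", O_RDWR);
        if (fd < 0)
                return 1;

        /* MAP_SHARED is mandatory: the driver rejects copy-on-write mappings. */
        void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (mem == MAP_FAILED)
                return 1;

        /* Data placed here stays resident in the locked cache portion. */
        ((volatile char *)mem)[0] = 1;

        munmap(mem, size);
        close(fd);
        return 0;
}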
*/ -DEFINE_MUTEX(rdtgroup_mutex); - -static struct kernfs_root *rdt_root; -struct rdtgroup rdtgroup_default; -LIST_HEAD(rdt_all_groups); - -/* list of entries for the schemata file */ -LIST_HEAD(resctrl_schema_all); - -/* The filesystem can only be mounted once. */ -bool resctrl_mounted; - -/* Kernel fs node for "info" directory under root */ -static struct kernfs_node *kn_info; - -/* Kernel fs node for "mon_groups" directory under root */ -static struct kernfs_node *kn_mongrp; - -/* Kernel fs node for "mon_data" directory under root */ -static struct kernfs_node *kn_mondata; - -/* - * Used to store the max resource name width to display the schemata names in - * a tabular format. - */ -int max_name_width; - -static struct seq_buf last_cmd_status; -static char last_cmd_status_buf[512]; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx); -static void rdtgroup_destroy_root(void); - -struct dentry *debugfs_resctrl; - -/* - * Memory bandwidth monitoring event to use for the default CTRL_MON group - * and each new CTRL_MON group created by the user. Only relevant when - * the filesystem is mounted with the "mba_MBps" option so it does not - * matter that it remains uninitialized on systems that do not support - * the "mba_MBps" option. - */ -enum resctrl_event_id mba_mbps_default_event; - -static bool resctrl_debug; - -void rdt_last_cmd_clear(void) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_clear(&last_cmd_status); -} - -void rdt_last_cmd_puts(const char *s) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_puts(&last_cmd_status, s); -} - -void rdt_last_cmd_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_vprintf(&last_cmd_status, fmt, ap); - va_end(ap); -} - -void rdt_staged_configs_clear(void) -{ - struct rdt_ctrl_domain *dom; - struct rdt_resource *r; - - lockdep_assert_held(&rdtgroup_mutex); - - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } -} - -static bool resctrl_is_mbm_enabled(void) -{ - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, - * we can keep a bitmap of free CLOSIDs in a single integer. - * - * Using a global CLOSID across all resources has some advantages and - * some drawbacks: - * + We can simply set current's closid to assign a task to a resource - * group. - * + Context switch code can avoid extra memory references deciding which - * CLOSID to load into the PQR_ASSOC MSR - * - We give up some options in configuring resource groups across multi-socket - * systems. - * - Our choices on how to configure each resource become progressively more - * limited as the number of resources grows. 
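The "trivial allocator for CLOSIDs" described above keeps all free CLOSIDs in one integer bitmap: closid_init() populates the low num_closid bits and reserves CLOSID 0 for the default group, closid_alloc() takes the lowest set bit, closid_free() sets it back. A compact sketch (the cleanest-CLOSID path taken when RMIDs depend on the CLOSID is omitted):

#include <errno.h>

static unsigned long closid_free_map;

static void closid_init(unsigned int num_closid)
{
        closid_free_map = (1UL << num_closid) - 1;
        closid_free_map &= ~1UL;        /* CLOSID 0 belongs to the default group */
}

static int closid_alloc(void)
{
        if (!closid_free_map)
                return -ENOSPC;

        int closid = __builtin_ctzl(closid_free_map);   /* lowest free CLOSID */
        closid_free_map &= ~(1UL << closid);
        return closid;
}

static void closid_free(int closid)
{
        closid_free_map |= 1UL << closid;
}

int main(void)
{
        closid_init(16);
        return closid_alloc() == 1 ? 0 : 1;     /* 0 is reserved, so 1 comes first */
}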
- */ -static unsigned long closid_free_map; -static int closid_free_map_len; - -int closids_supported(void) -{ - return closid_free_map_len; -} - -static void closid_init(void) -{ - struct resctrl_schema *s; - u32 rdt_min_closid = 32; - - /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) - rdt_min_closid = min(rdt_min_closid, s->num_closid); - - closid_free_map = BIT_MASK(rdt_min_closid) - 1; - - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); - closid_free_map_len = rdt_min_closid; -} - -static int closid_alloc(void) -{ - int cleanest_closid; - u32 closid; - - lockdep_assert_held(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { - cleanest_closid = resctrl_find_cleanest_closid(); - if (cleanest_closid < 0) - return cleanest_closid; - closid = cleanest_closid; - } else { - closid = ffs(closid_free_map); - if (closid == 0) - return -ENOSPC; - closid--; - } - __clear_bit(closid, &closid_free_map); - - return closid; -} - -void closid_free(int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - __set_bit(closid, &closid_free_map); -} - -/** - * closid_allocated - test if provided closid is in use - * @closid: closid to be tested - * - * Return: true if @closid is currently associated with a resource group, - * false if @closid is free - */ -bool closid_allocated(unsigned int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - return !test_bit(closid, &closid_free_map); -} - -/** - * rdtgroup_mode_by_closid - Return mode of resource group with closid - * @closid: closid if the resource group - * - * Each resource group is associated with a @closid. Here the mode - * of a resource group can be queried by searching for it using its closid. 
- * - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid - */ -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) -{ - struct rdtgroup *rdtgrp; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->closid == closid) - return rdtgrp->mode; - } - - return RDT_NUM_MODES; -} - -static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", -}; - -/** - * rdtgroup_mode_str - Return the string representation of mode - * @mode: the resource group mode as &enum rdtgroup_mode - * - * Return: string representation of valid mode, "unknown" otherwise - */ -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) -{ - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) - return "unknown"; - - return rdt_mode_str[mode]; -} -/* set uid and gid of rdtgroup dirs and files to that of the creator */ -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) -{ - struct kernfs_node *kn; - int ret; - - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - 0, rft->kf_ops, rft, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return 0; -} - -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rftype *rft = of->kn->priv; - - if (rft->seq_show) - return rft->seq_show(of, m, arg); - return 0; -} - -static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct rftype *rft = of->kn->priv; - - if (rft->write) - return rft->write(of, buf, nbytes, off); - - return -EINVAL; -} - -static const struct kernfs_ops rdtgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .write = rdtgroup_file_write, - .seq_show = rdtgroup_seqfile_show, -}; - -static const struct kernfs_ops kf_mondata_ops = { - .atomic_write_len = PAGE_SIZE, - .seq_show = rdtgroup_mondata_show, -}; - -static bool is_cpu_list(struct kernfs_open_file *of) -{ - struct rftype *rft = of->kn->priv; - - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; -} - -static int rdtgroup_cpus_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct cpumask *mask; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - mask = &rdtgrp->plr->d->hdr.cpu_mask; - seq_printf(s, is_cpu_list(of) ? - "%*pbl\n" : "%*pb\n", - cpumask_pr_args(mask)); - } - } else { - seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); - return ret; -} +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * This is safe against resctrl_sched_in() called from __switch_to() + * This is safe against resctrl_arch_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. @@ -382,1223 +58,7 @@ void resctrl_arch_sync_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(current); -} - -/* - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, - * - * Per task closids/rmids must have been set up before calling this function. - * @r may be NULL. - */ -static void -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) -{ - struct resctrl_cpu_defaults defaults, *p = NULL; - - if (r) { - defaults.closid = r->closid; - defaults.rmid = r->mon.rmid; - p = &defaults; - } - - on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); -} - -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) -{ - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; - struct list_head *head; - - /* Check whether cpus belong to parent ctrl group */ - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); - return -EINVAL; - } - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Give any dropped cpus to parent rdtgroup */ - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); - update_closid_rmid(tmpmask, prgrp); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu rmid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - if (crgrp == rdtgrp) - continue; - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, - tmpmask); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - return 0; -} - -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) -{ - struct rdtgroup *crgrp; - - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); - /* update the child mon group masks as well*/ - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); -} - -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) -{ - struct rdtgroup *r, *crgrp; - struct list_head *head; - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); - return -EINVAL; - } - - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - update_closid_rmid(tmpmask, &rdtgroup_default); - } - - /* - * If 
we added cpus, remove them from previous group and - * the prev group's child groups that owned them - * and update per-cpu closid/rmid. - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (!cpumask_empty(tmpmask1)) - cpumask_rdtgrp_clear(r, tmpmask1); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - /* - * Clear child mon group masks since there is a new parent mask - * now and update the rmid for the cpus the child lost. - */ - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); - update_closid_rmid(tmpmask, rdtgrp); - cpumask_clear(&crgrp->cpu_mask); - } - - return 0; -} - -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - cpumask_var_t tmpmask, newmask, tmpmask1; - struct rdtgroup *rdtgrp; - int ret; - - if (!buf) - return -EINVAL; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - return -ENOMEM; - } - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - return -ENOMEM; - } - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto unlock; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - if (is_cpu_list(of)) - ret = cpulist_parse(buf, newmask); - else - ret = cpumask_parse(buf, newmask); - - if (ret) { - rdt_last_cmd_puts("Bad CPU list/mask\n"); - goto unlock; - } - - /* check that user didn't specify any offline cpus */ - cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (!cpumask_empty(tmpmask)) { - ret = -EINVAL; - rdt_last_cmd_puts("Can only assign online CPUs\n"); - goto unlock; - } - - if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); - else if (rdtgrp->type == RDTMON_GROUP) - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); - else - ret = -EINVAL; - -unlock: - rdtgroup_kn_unlock(of->kn); - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - free_cpumask_var(tmpmask1); - - return ret ?: nbytes; -} - -/** - * rdtgroup_remove - the helper to remove resource group safely - * @rdtgrp: resource group to remove - * - * On resource group creation via a mkdir, an extra kernfs_node reference is - * taken to ensure that the rdtgroup structure remains accessible for the - * rdtgroup_kn_unlock() calls where it is removed. - * - * Drop the extra reference here, then free the rdtgroup structure. - * - * Return: void - */ -static void rdtgroup_remove(struct rdtgroup *rdtgrp) -{ - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); -} - -static void _update_task_closid_rmid(void *task) -{ - /* - * If the task is still current on this CPU, update PQR_ASSOC MSR. - * Otherwise, the MSR is updated when the task is scheduled in. 
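From user space, cpus_ctrl_write()/cpus_mon_write() are driven by writing a CPU list (or a mask, for the "cpus" file) into the group directory. A hedged sketch, assuming resctrl is mounted at /sys/fs/resctrl and a control group named grp0 already exists; both are assumptions, not part of this diff:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/resctrl/grp0/cpus_list";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* CPUs dropped from a control group fall back to the default group;
	 * offline CPUs are rejected by the kernel */
	if (fputs("2-3,6\n", f) == EOF || fclose(f) == EOF) {
		perror("write cpus_list");	/* info/last_cmd_status has the reason */
		return 1;
	}
	return 0;
}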
- */ - if (task == current) - resctrl_sched_in(task); -} - -static void update_task_closid_rmid(struct task_struct *t) -{ - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); - else - _update_task_closid_rmid(t); -} - -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) -{ - u32 closid, rmid = rdtgrp->mon.rmid; - - if (rdtgrp->type == RDTCTRL_GROUP) - closid = rdtgrp->closid; - else if (rdtgrp->type == RDTMON_GROUP) - closid = rdtgrp->mon.parent->closid; - else - return false; - - return resctrl_arch_match_closid(tsk, closid) && - resctrl_arch_match_rmid(tsk, closid, rmid); -} - -static int __rdtgroup_move_task(struct task_struct *tsk, - struct rdtgroup *rdtgrp) -{ - /* If the task is already in rdtgrp, no need to move the task. */ - if (task_in_rdtgroup(tsk, rdtgrp)) - return 0; - - /* - * Set the task's closid/rmid before the PQR_ASSOC MSR can be - * updated by them. - * - * For ctrl_mon groups, move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTMON_GROUP && - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } - - if (rdtgrp->type == RDTMON_GROUP) - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, - rdtgrp->mon.rmid); - else - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, - rdtgrp->mon.rmid); - - /* - * Ensure the task's closid and rmid are written before determining if - * the task is current that will decide if it will be interrupted. - * This pairs with the full barrier between the rq->curr update and - * resctrl_sched_in() during context switch. - */ - smp_mb(); - - /* - * By now, the task's closid and rmid are set. If the task is current - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource - * group go into effect. If the task is not current, the MSR will be - * updated when the task is scheduled in. - */ - update_task_closid_rmid(tsk); - - return 0; -} - -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && - resctrl_arch_match_closid(t, r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && - resctrl_arch_match_rmid(t, r->mon.parent->closid, - r->mon.rmid)); -} - -/** - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group - * @r: Resource group - * - * Return: 1 if tasks have been assigned to @r, 0 otherwise - */ -int rdtgroup_tasks_assigned(struct rdtgroup *r) -{ - struct task_struct *p, *t; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static int rdtgroup_task_write_permission(struct task_struct *task, - struct kernfs_open_file *of) -{ - const struct cred *tcred = get_task_cred(task); - const struct cred *cred = current_cred(); - int ret = 0; - - /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. 
- */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); - ret = -EPERM; - } - - put_cred(tcred); - return ret; -} - -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, - struct kernfs_open_file *of) -{ - struct task_struct *tsk; - int ret; - - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - rdt_last_cmd_printf("No task %d\n", pid); - return -ESRCH; - } - } else { - tsk = current; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = rdtgroup_task_write_permission(tsk, of); - if (!ret) - ret = __rdtgroup_move_task(tsk, rdtgrp); - - put_task_struct(tsk); - return ret; -} - -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - char *pid_str; - int ret = 0; - pid_t pid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - while (buf && buf[0] != '\0' && buf[0] != '\n') { - pid_str = strim(strsep(&buf, ",")); - - if (kstrtoint(pid_str, 0, &pid)) { - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); - ret = -EINVAL; - break; - } - - if (pid < 0) { - rdt_last_cmd_printf("Invalid pid %d\n", pid); - ret = -EINVAL; - break; - } - - ret = rdtgroup_move_task(pid, rdtgrp, of); - if (ret) { - rdt_last_cmd_printf("Error while processing task %d\n", pid); - break; - } - } - -unlock: - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) -{ - struct task_struct *p, *t; - pid_t pid; - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - pid = task_pid_vnr(t); - if (pid) - seq_printf(s, "%d\n", pid); - } - } - rcu_read_unlock(); -} - -static int rdtgroup_tasks_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - show_rdt_tasks(rdtgrp, s); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_closid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->closid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_rmid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->mon.rmid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -#ifdef CONFIG_PROC_CPU_RESCTRL - -/* - * A task can only be part of one resctrl control group and of one monitor - * group which is associated to that control group. - * - * 1) res: - * mon: - * - * resctrl is not available. - * - * 2) res:/ - * mon: - * - * Task is part of the root resctrl control group, and it is not associated - * to any monitor group. - * - * 3) res:/ - * mon:mon0 - * - * Task is part of the root resctrl control group and monitor group mon0. 
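rdtgroup_tasks_write() above accepts one or more comma-separated PIDs, and a PID of 0 means the writing task itself. A small userspace sketch under the same assumed /sys/fs/resctrl/grp0 layout:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/resctrl/grp0/tasks", "w");

	if (!f) {
		perror("tasks");
		return 1;
	}
	/* several PIDs may be written at once, separated by commas;
	 * writing "0" would move the caller as well */
	fprintf(f, "%d\n", getpid());
	if (fclose(f) == EOF) {
		perror("move task");
		return 1;
	}
	return 0;
}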
- * - * 4) res:group0 - * mon: - * - * Task is part of resctrl control group group0, and it is not associated - * to any monitor group. - * - * 5) res:group0 - * mon:mon1 - * - * Task is part of resctrl control group group0 and monitor group mon1. - */ -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - struct rdtgroup *rdtg; - int ret = 0; - - mutex_lock(&rdtgroup_mutex); - - /* Return empty if resctrl has not been mounted. */ - if (!resctrl_mounted) { - seq_puts(s, "res:\nmon:\n"); - goto unlock; - } - - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { - struct rdtgroup *crg; - - /* - * Task information is only relevant for shareable - * and exclusive groups. - */ - if (rdtg->mode != RDT_MODE_SHAREABLE && - rdtg->mode != RDT_MODE_EXCLUSIVE) - continue; - - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) - continue; - - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", - rdt_kn_name(rdtg->kn)); - seq_puts(s, "mon:"); - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, - mon.crdtgrp_list) { - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, - crg->mon.rmid)) - continue; - seq_printf(s, "%s", rdt_kn_name(crg->kn)); - break; - } - seq_putc(s, '\n'); - goto unlock; - } - /* - * The above search should succeed. Otherwise return - * with an error. - */ - ret = -ENOENT; -unlock: - mutex_unlock(&rdtgroup_mutex); - - return ret; -} -#endif - -static int rdt_last_cmd_status_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - int len; - - mutex_lock(&rdtgroup_mutex); - len = seq_buf_used(&last_cmd_status); - if (len) - seq_printf(seq, "%.*s", len, last_cmd_status_buf); - else - seq_puts(seq, "ok\n"); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static void *rdt_kn_parent_priv(struct kernfs_node *kn) -{ - /* - * The parent pointer is only valid within RCU section since it can be - * replaced. - */ - guard(rcu)(); - return rcu_dereference(kn->__parent)->priv; -} - -static int rdt_num_closids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%u\n", s->num_closid); - return 0; -} - -static int rdt_default_ctrl_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); - return 0; -} - -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); - return 0; -} - -static int rdt_shareable_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->cache.shareable_bits); - return 0; -} - -/* - * rdt_bit_usage_show - Display current usage of resources - * - * A domain is a shared resource that can now be allocated differently. Here - * we display the current regions of the domain as an annotated bitmask. 
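proc_resctrl_show() above backs the per-task /proc/<pid>/resctrl file, so the res:/mon: cases listed in the comment can be inspected directly. A sketch, assuming the kernel was built with CONFIG_PROC_CPU_RESCTRL:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/resctrl", "r");

	if (!f) {
		perror("/proc/self/resctrl");
		return 1;
	}
	/* prints "res:<control group>" and "mon:<monitor group>";
	 * both values are empty when resctrl is not mounted */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}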
- * For each domain of this resource its allocation bitmask - * is annotated as below to indicate the current usage of the corresponding bit: - * 0 - currently unused - * X - currently available for sharing and used by software and hardware - * H - currently used by hardware only but available for software use - * S - currently used and shareable by software only - * E - currently used exclusively by one resource group - * P - currently pseudo-locked by one resource group - */ -static int rdt_bit_usage_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - /* - * Use unsigned long even though only 32 bits are used to ensure - * test_bit() is used safely. - */ - unsigned long sw_shareable = 0, hw_shareable = 0; - unsigned long exclusive = 0, pseudo_locked = 0; - struct rdt_resource *r = s->res; - struct rdt_ctrl_domain *dom; - int i, hwb, swb, excl, psl; - enum rdtgrp_mode mode; - bool sep = false; - u32 ctrl_val; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(seq, ';'); - sw_shareable = 0; - exclusive = 0; - seq_printf(seq, "%d=", dom->hdr.id); - for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) - continue; - ctrl_val = resctrl_arch_get_config(r, dom, i, - s->conf_type); - mode = rdtgroup_mode_by_closid(i); - switch (mode) { - case RDT_MODE_SHAREABLE: - sw_shareable |= ctrl_val; - break; - case RDT_MODE_EXCLUSIVE: - exclusive |= ctrl_val; - break; - case RDT_MODE_PSEUDO_LOCKSETUP: - /* - * RDT_MODE_PSEUDO_LOCKSETUP is possible - * here but not included since the CBM - * associated with this CLOSID in this mode - * is not initialized and no task or cpu can be - * assigned this CLOSID. - */ - break; - case RDT_MODE_PSEUDO_LOCKED: - case RDT_NUM_MODES: - WARN(1, - "invalid mode for closid %d\n", i); - break; - } - } - for (i = r->cache.cbm_len - 1; i >= 0; i--) { - pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; - hwb = test_bit(i, &hw_shareable); - swb = test_bit(i, &sw_shareable); - excl = test_bit(i, &exclusive); - psl = test_bit(i, &pseudo_locked); - if (hwb && swb) - seq_putc(seq, 'X'); - else if (hwb && !swb) - seq_putc(seq, 'H'); - else if (!hwb && swb) - seq_putc(seq, 'S'); - else if (excl) - seq_putc(seq, 'E'); - else if (psl) - seq_putc(seq, 'P'); - else /* Unused bits remain */ - seq_putc(seq, '0'); - } - sep = true; - } - seq_putc(seq, '\n'); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return 0; -} - -static int rdt_min_bw_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.min_bw); - return 0; -} - -static int rdt_num_rmids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%d\n", r->num_rmid); - - return 0; -} - -static int rdt_mon_features_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct mon_evt *mevt; - - list_for_each_entry(mevt, &r->evt_list, list) { - seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) - seq_printf(seq, "%s_config\n", mevt->name); - } - - return 0; -} - -static int rdt_bw_gran_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.bw_gran); - return 0; -} - -static int rdt_delay_linear_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.delay_linear); - return 0; -} - -static int max_threshold_occ_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - - return 0; -} - -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - switch (r->membw.throttle_mode) { - case THREAD_THROTTLE_PER_THREAD: - seq_puts(seq, "per-thread\n"); - return 0; - case THREAD_THROTTLE_MAX: - seq_puts(seq, "max\n"); - return 0; - case THREAD_THROTTLE_UNDEFINED: - seq_puts(seq, "undefined\n"); - return 0; - } - - WARN_ON_ONCE(1); - - return 0; -} - -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - unsigned int bytes; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - if (bytes > resctrl_rmid_realloc_limit) - return -EINVAL; - - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); - - return nbytes; -} - -/* - * rdtgroup_mode_show - Display mode of this resource group - */ -static int rdtgroup_mode_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); - - rdtgroup_kn_unlock(of->kn); - return 0; -} - -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) -{ - switch (my_type) { - case CDP_CODE: - return CDP_DATA; - case CDP_DATA: - return CDP_CODE; - default: - case CDP_NONE: - return 
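The legend printed by rdt_bit_usage_show() is a per-bit decision over four bitmaps. The same decision tree in a standalone form (the example masks are invented):

#include <stdio.h>

static void print_bit_usage(unsigned long hw, unsigned long sw,
			    unsigned long excl, unsigned long psl, int cbm_len)
{
	for (int i = cbm_len - 1; i >= 0; i--) {
		int hwb = !!(hw & (1UL << i));
		int swb = !!(sw & (1UL << i));

		if (hwb && swb)
			putchar('X');	/* shared by hardware and software */
		else if (hwb)
			putchar('H');	/* hardware only */
		else if (swb)
			putchar('S');	/* software only */
		else if (excl & (1UL << i))
			putchar('E');	/* exclusive to one group */
		else if (psl & (1UL << i))
			putchar('P');	/* pseudo-locked */
		else
			putchar('0');	/* unused */
	}
	putchar('\n');
}

int main(void)
{
	/* example: 11-bit CBM, low two bits hardware-shareable */
	print_bit_usage(0x003, 0x0ff, 0x700, 0x000, 11);	/* EEESSSSSSXX */
	return 0;
}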
CDP_NONE; - } -} - -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); - - return 0; -} - -/** - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other - * @r: Resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @type: CDP type of @r. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Checks if provided @cbm intended to be used for @closid on domain - * @d overlaps with any other closids or other hardware usage associated - * with this domain. If @exclusive is true then only overlaps with - * resource groups in exclusive mode will be considered. If @exclusive - * is false then overlaps with any resource group or hardware entities - * will be considered. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: false if CBM does not overlap, true if it does. - */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, - enum resctrl_conf_type type, bool exclusive) -{ - enum rdtgrp_mode mode; - unsigned long ctrl_b; - int i; - - /* Check for any overlap with regions used by hardware directly */ - if (!exclusive) { - ctrl_b = r->cache.shareable_bits; - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) - return true; - } - - /* Check for overlap with other resource groups */ - for (i = 0; i < closids_supported(); i++) { - ctrl_b = resctrl_arch_get_config(r, d, i, type); - mode = rdtgroup_mode_by_closid(i); - if (closid_allocated(i) && i != closid && - mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { - if (exclusive) { - if (mode == RDT_MODE_EXCLUSIVE) - return true; - continue; - } - return true; - } - } - } - - return false; -} - -/** - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @s: Schema for the resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Resources that can be allocated using a CBM can use the CBM to control - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test - * for overlap. Overlap test is not limited to the specific resource for - * which the CBM is intended though - when dealing with CDP resources that - * share the underlying hardware the overlap check should be performed on - * the CDP resource sharing the hardware also. - * - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the - * overlap test. 
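At its core, __rdtgroup_cbm_overlaps() is a bitmap-intersection test against every other allocated CLOSID. A stripped-down sketch that leaves out the mode handling and the hardware-shareable bits; the CLOSID count and table contents are invented:

#include <stdbool.h>
#include <stdio.h>

#define NUM_CLOSID 4

/* one CBM per CLOSID for a single domain; 0 means "not allocated" here */
static unsigned long cbm_by_closid[NUM_CLOSID] = { 0x7ff, 0x0f0, 0, 0 };

/* would @cbm for @closid intersect any other allocated CLOSID's CBM? */
static bool cbm_overlaps(unsigned long cbm, int closid)
{
	for (int i = 0; i < NUM_CLOSID; i++) {
		if (i == closid || !cbm_by_closid[i])
			continue;
		if (cbm & cbm_by_closid[i])
			return true;
	}
	return false;
}

int main(void)
{
	printf("0x00f for closid 2: %s\n", cbm_overlaps(0x00f, 2) ? "overlaps" : "free");
	printf("0x800 for closid 1: %s\n", cbm_overlaps(0x800, 1) ? "overlaps" : "free");
	return 0;
}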
- * - * Return: true if CBM overlap detected, false if there is no overlap - */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - struct rdt_resource *r = s->res; - - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, - exclusive)) - return true; - - if (!resctrl_arch_get_cdp_enabled(r->rid)) - return false; - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); -} - -/** - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive - * @rdtgrp: Resource group identified through its closid. - * - * An exclusive resource group implies that there should be no sharing of - * its allocated resources. At the time this group is considered to be - * exclusive this test can determine if its current schemata supports this - * setting by testing for overlap with all other resource groups. - * - * Return: true if resource group can be exclusive, false if there is overlap - * with allocations of other resource groups and thus this resource group - * cannot be exclusive. - */ -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) -{ - int closid = rdtgrp->closid; - struct rdt_ctrl_domain *d; - struct resctrl_schema *s; - struct rdt_resource *r; - bool has_cache = false; - u32 ctrl; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) - continue; - has_cache = true; - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - ctrl = resctrl_arch_get_config(r, d, closid, - s->conf_type); - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { - rdt_last_cmd_puts("Schemata overlaps\n"); - return false; - } - } - } - - if (!has_cache) { - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); - return false; - } - - return true; -} - -/* - * rdtgroup_mode_write - Modify the resource group's mode - */ -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - enum rdtgrp_mode mode; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - rdt_last_cmd_clear(); - - mode = rdtgrp->mode; - - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || - (!strcmp(buf, "pseudo-locksetup") && - mode == RDT_MODE_PSEUDO_LOCKSETUP) || - (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) - goto out; - - if (mode == RDT_MODE_PSEUDO_LOCKED) { - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); - ret = -EINVAL; - goto out; - } - - if (!strcmp(buf, "shareable")) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_SHAREABLE; - } else if (!strcmp(buf, "exclusive")) { - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - ret = -EINVAL; - goto out; - } - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && - !strcmp(buf, 
"pseudo-locksetup")) { - ret = rdtgroup_locksetup_enter(rdtgrp); - if (ret) - goto out; - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; - } else { - rdt_last_cmd_puts("Unknown or unsupported mode\n"); - ret = -EINVAL; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * rdtgroup_cbm_to_size - Translate CBM to size in bytes - * @r: RDT resource to which @d belongs. - * @d: RDT domain instance. - * @cbm: bitmask for which the size should be computed. - * - * The bitmask provided associated with the RDT domain instance @d will be - * translated into how many bytes it represents. The size in bytes is - * computed by first dividing the total cache size by the CBM length to - * determine how many bytes each bit in the bitmask represents. The result - * is multiplied with the number of bits set in the bitmask. - * - * @cbm is unsigned long, even if only 32 bits are used to make the - * bitmap functions work correctly. - */ -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int size = 0; - struct cacheinfo *ci; - int num_b; - - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) - return size; - - num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); - if (ci) - size = ci->size / r->cache.cbm_len * num_b; - - return size; -} - -/* - * rdtgroup_size_show - Display size in bytes of allocated regions - * - * The "size" file mirrors the layout of the "schemata" file, printing the - * size in bytes of each region instead of the capacity bitmask. - */ -static int rdtgroup_size_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - enum resctrl_conf_type type; - struct rdt_ctrl_domain *d; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - unsigned int size; - int ret = 0; - u32 closid; - bool sep; - u32 ctrl; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); - } - goto out; - } - - closid = rdtgrp->closid; - - list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - type = schema->conf_type; - sep = false; - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(s, ';'); - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - size = 0; - } else { - if (is_mba_sc(r)) - ctrl = d->mbps_val[closid]; - else - ctrl = resctrl_arch_get_config(r, d, - closid, - type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else - size = rdtgroup_cbm_to_size(r, d, ctrl); - } - seq_printf(s, "%d=%u", d->hdr.id, size); - sep = true; - } - seq_putc(s, '\n'); - } - -out: - rdtgroup_kn_unlock(of->kn); - - return ret; + resctrl_arch_sched_in(current); } #define INVALID_CONFIG_INDEX UINT_MAX @@ -1642,62 +102,6 @@ void resctrl_arch_mon_event_config_read(void *_config_info) config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct resctrl_mon_config_info *mon_info) -{ - 
smp_call_function_any(&mon_info->d->hdr.cpu_mask, - resctrl_arch_mon_event_config_read, mon_info, 1); -} - -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) -{ - struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; - bool sep = false; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); - mon_info.r = r; - mon_info.d = dom; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); - sep = true; - } - seq_puts(s, "\n"); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return 0; -} - -static int mbm_total_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); - - return 0; -} - -static int mbm_local_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); - - return 0; -} - void resctrl_arch_mon_event_config_write(void *_config_info) { struct resctrl_mon_config_info *config_info = _config_info; @@ -1711,618 +115,6 @@ void resctrl_arch_mon_event_config_write(void *_config_info) wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } -static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) -{ - struct resctrl_mon_config_info mon_info = {0}; - - /* - * Read the current config value first. If both are the same then - * no need to write it again. - */ - mon_info.r = r; - mon_info.d = d; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - if (mon_info.mon_config == val) - return; - - mon_info.mon_config = val; - - /* - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE - * are scoped at the domain level. Writing any of these MSRs - * on one CPU is observed by all the CPUs in the domain. - */ - smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, - &mon_info, 1); - - /* - * When an Event Configuration is changed, the bandwidth counters - * for all RMIDs and Events will be cleared by the hardware. The - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for - * every RMID on the next read to any event for every RMID. - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) - * cleared while it is tracked by the hardware. Clear the - * mbm_local and mbm_total counts for all the RMIDs. 
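rdtgroup_cbm_to_size() above converts a capacity bitmask to bytes by dividing the cache size by the CBM length and multiplying by the number of set bits. The same arithmetic standalone (the 33 MB / 11-bit cache is only an example):

#include <stdio.h>

/* bytes represented by a capacity bitmask:
 * cache_size / cbm_len * popcount(cbm) */
static unsigned int cbm_to_size(unsigned int cache_size, int cbm_len,
				unsigned long cbm)
{
	return cache_size / cbm_len * __builtin_popcountl(cbm);
}

int main(void)
{
	/* a 33 MB L3 with an 11-bit CBM: each bit covers 3 MB */
	printf("%u bytes\n", cbm_to_size(33U << 20, 11, 0x00f));	/* 12582912 */
	return 0;
}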
- */ - resctrl_arch_reset_rmid_all(r, d); -} - -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) -{ - char *dom_str = NULL, *id_str; - unsigned long dom_id, val; - struct rdt_mon_domain *d; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - -next: - if (!tok || tok[0] == '\0') - return 0; - - /* Start processing the strings for each domain */ - dom_str = strim(strsep(&tok, ";")); - id_str = strsep(&dom_str, "="); - - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); - return -EINVAL; - } - - if (!dom_str || kstrtoul(dom_str, 16, &val)) { - rdt_last_cmd_puts("Non-numeric event configuration value\n"); - return -EINVAL; - } - - /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); - return -EINVAL; - } - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->hdr.id == dom_id) { - mbm_config_write_domain(r, d, evtid, val); - goto next; - } - } - - return -EINVAL; -} - -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -/* rdtgroup information files for one cache resource. 
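mon_config_write() parses "<domain id>=<hex value>" pairs separated by ';' and rejects values outside the supported event mask. A userspace approximation of that parse; it omits the whitespace trimming and the exact error reporting of the original:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_mon_config(char *tok, unsigned long cfg_mask)
{
	while (tok && tok[0] != '\0') {
		char *dom_str = strsep(&tok, ";");
		char *id_str = strsep(&dom_str, "=");
		unsigned long dom_id, val;
		char *end;

		dom_id = strtoul(id_str, &end, 10);
		if (end == id_str || *end != '\0' || !dom_str)
			return -1;	/* missing '=' or non-numeric domain id */

		val = strtoul(dom_str, &end, 16);
		if (end == dom_str || *end != '\0')
			return -1;	/* non-numeric configuration value */

		if ((val & cfg_mask) != val)
			return -1;	/* events outside the supported mask */

		printf("domain %lu -> config 0x%02lx\n", dom_id, val);
	}
	return 0;
}

int main(void)
{
	char buf[] = "0=0x33;1=0x7f";

	return parse_mon_config(buf, 0x7f) ? 1 : 0;
}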
*/ -static struct rftype res_common_files[] = { - { - .name = "last_cmd_status", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_last_cmd_status_show, - .fflags = RFTYPE_TOP_INFO, - }, - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, - .fflags = RFTYPE_CTRL_INFO, - }, - { - .name = "mon_features", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_mon_features_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "num_rmids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_rmids_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "cbm_mask", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_default_ctrl_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_cbm_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_cbm_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "shareable_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_shareable_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "bit_usage", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bit_usage_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_bandwidth", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_bw_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "bandwidth_gran", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bw_gran_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "delay_linear", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_delay_linear_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - /* - * Platform specific which (if any) capabilities are provided by - * thread_throttle_mode. Defer "fflags" initialization to platform - * discovery. 
- */ - { - .name = "thread_throttle_mode", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_thread_throttle_mode_show, - }, - { - .name = "max_threshold_occupancy", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = max_threshold_occ_write, - .seq_show = max_threshold_occ_show, - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "mbm_total_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_total_bytes_config_show, - .write = mbm_total_bytes_config_write, - }, - { - .name = "mbm_local_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_local_bytes_config_show, - .write = mbm_local_bytes_config_write, - }, - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - .fflags = RFTYPE_BASE, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "mon_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_rmid_show, - .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "mba_MBps_event", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mba_mbps_event_write, - .seq_show = rdtgroup_mba_mbps_event_show, - }, - { - .name = "mode", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mode_write, - .seq_show = rdtgroup_mode_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "size", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_size_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "sparse_masks", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "ctrl_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_closid_show, - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, - }, - -}; - -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -{ - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - lockdep_assert_held(&rdtgroup_mutex); - - if (resctrl_debug) - fflags |= RFTYPE_DEBUG; - - for (rft = rfts; rft < rfts + len; rft++) { - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) { - if ((fflags & rft->fflags) == rft->fflags) - kernfs_remove_by_name(kn, rft->name); - } - return ret; -} - -static struct rftype *rdtgroup_get_rftype_by_name(const char *name) -{ - struct rftype *rfts, *rft; - int len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - return rft; - } - - return NULL; -} - -static void thread_throttle_mode_init(void) -{ - enum membw_throttle_mode throttle_mode 
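rdtgroup_add_files() instantiates only those res_common_files entries whose fflags are fully contained in the flags requested for a directory. The subset test in isolation, with made-up flag values standing in for the RFTYPE_* constants:

#include <stdio.h>

struct file_desc {
	const char *name;
	unsigned long fflags;	/* all of these must be present in the request */
};

#define RF_CTRL_INFO	0x1UL
#define RF_MON_INFO	0x2UL
#define RF_RES_CACHE	0x4UL

static const struct file_desc files[] = {
	{ "num_closids",	RF_CTRL_INFO },
	{ "num_rmids",		RF_MON_INFO },
	{ "bit_usage",		RF_CTRL_INFO | RF_RES_CACHE },
};

int main(void)
{
	unsigned long fflags = RF_CTRL_INFO | RF_RES_CACHE;	/* e.g. a cache info dir */

	for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); i++)
		if ((fflags & files[i].fflags) == files[i].fflags)
			printf("add %s\n", files[i].name);	/* num_closids, bit_usage */
	return 0;
}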
= THREAD_THROTTLE_UNDEFINED; - struct rdt_resource *r_mba, *r_smba; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; - - r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); - if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; - - if (throttle_mode == THREAD_THROTTLE_UNDEFINED) - return; - - resctrl_file_fflags_init("thread_throttle_mode", - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); -} - -void resctrl_file_fflags_init(const char *config, unsigned long fflags) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name(config); - if (rft) - rft->fflags = fflags; -} - -/** - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * - * The permissions of named resctrl file, directory, or link are modified - * to not allow read, write, or execute by any user. - * - * WARNING: This function is intended to communicate to the user that the - * resctrl file has been locked down - that it is not relevant to the - * particular state the system finds itself in. It should not be relied - * on to protect from user access because after the file's permissions - * are restricted the user can still change the permissions using chmod - * from the command line. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn; - int ret = 0; - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - iattr.ia_mode = S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode = S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode = S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -/** - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * @mask: Mask of permissions that should be restored - * - * Restore the permissions of the named file. If @name is a directory the - * permissions of its parent will be used. - * - * Return: 0 on success, <0 on failure. 
- */ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn, *parent; - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode & mask; - } - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - parent = kernfs_get_parent(kn); - if (parent) { - iattr.ia_mode |= parent->mode; - kernfs_put(parent); - } - iattr.ia_mode |= S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode |= S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode |= S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -static int rdtgroup_mkdir_info_resdir(void *priv, char *name, - unsigned long fflags) -{ - struct kernfs_node *kn_subdir; - int ret; - - kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, priv); - if (IS_ERR(kn_subdir)) - return PTR_ERR(kn_subdir); - - ret = rdtgroup_kn_set_ugid(kn_subdir); - if (ret) - return ret; - - ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); - - return ret; -} - -static unsigned long fflags_from_resource(struct rdt_resource *r) -{ - switch (r->rid) { - case RDT_RESOURCE_L3: - case RDT_RESOURCE_L2: - return RFTYPE_RES_CACHE; - case RDT_RESOURCE_MBA: - case RDT_RESOURCE_SMBA: - return RFTYPE_RES_MB; - } - - return WARN_ON_ONCE(1); -} - -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - unsigned long fflags; - char name[32]; - int ret; - - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); - - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); - if (ret) - goto out_destroy; - - /* loop over enabled controls, these are all alloc_capable */ - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); - if (ret) - goto out_destroy; - } - - for_each_mon_capable_rdt_resource(r) { - fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; - sprintf(name, "%s_MON", r->name); - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); - if (ret) - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(kn_info); - if (ret) - goto out_destroy; - - kernfs_activate(kn_info); - - return 0; - -out_destroy: - kernfs_remove(kn_info); - return ret; -} - -static int -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, - char *name, struct kernfs_node **dest_kn) -{ - struct kernfs_node *kn; - int ret; - - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - if (dest_kn) - *dest_kn = kn; - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - kernfs_activate(kn); - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -2337,11 +129,6 @@ static void l2_qos_cfg_update(void *arg) wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? 
L2_QOS_CDP_ENABLE : 0ULL); } -static inline bool is_mba_linear(void) -{ - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; -} - static int set_cache_qos_cfg(int level, bool enable) { void (*update)(void *arg); @@ -2397,76 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); - int i; - - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), - GFP_KERNEL, cpu_to_node(cpu)); - if (!d->mbps_val) - return -ENOMEM; - - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - - return 0; -} - -static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_ctrl_domain *d) -{ - kfree(d->mbps_val); - d->mbps_val = NULL; -} - -/* - * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale, - * and the MBM monitor scope is the same as MBA - * control scope. - */ -static bool supports_mba_mbps(void) -{ - struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - return (resctrl_is_mbm_enabled() && - r->alloc_capable && is_mba_linear() && - r->ctrl_scope == rmbm->mon_scope); -} - -/* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. - */ -static int set_mba_sc(bool mba_sc) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_ctrl_domain *d; - unsigned long fflags; - int i; - - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) - return -EINVAL; - - r->membw.mba_sc = mba_sc; - - rdtgroup_default.mba_mbps_event = mba_mbps_default_event; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - } - - fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; - resctrl_file_fflags_init("mba_MBps_event", fflags); - - return 0; -} - static int cdp_enable(int level) { struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; @@ -2507,419 +224,9 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -/* - * We don't allow rdtgroup directories to be created anywhere - * except the root directory. Thus when looking for the rdtgroup - * structure for a kernfs node we are either looking at a directory, - * in which case the rdtgroup structure is pointed at by the "priv" - * field, otherwise we have a file, and need only look to the parent - * to find the rdtgroup. - */ -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) -{ - if (kernfs_type(kn) == KERNFS_DIR) { - /* - * All the resource directories use "kn->priv" - * to point to the "struct rdtgroup" for the - * resource. "info" and its subdirectories don't - * have rdtgroup structures, so return NULL here. 
- */ - if (kn == kn_info || - rcu_access_pointer(kn->__parent) == kn_info) - return NULL; - else - return kn->priv; - } else { - return rdt_kn_parent_priv(kn); - } -} - -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); -} - -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } -} - -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return NULL; - - rdtgroup_kn_get(rdtgrp, kn); - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* Was this group deleted while we waited? */ - if (rdtgrp->flags & RDT_DELETED) - return NULL; - - return rdtgrp; -} - -void rdtgroup_kn_unlock(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return; - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - rdtgroup_kn_put(rdtgrp, kn); -} - -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **mon_data_kn); - -static void rdt_disable_ctx(void) -{ - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - set_mba_sc(false); - - resctrl_debug = false; -} - -static int rdt_enable_ctx(struct rdt_fs_context *ctx) -{ - int ret = 0; - - if (ctx->enable_cdpl2) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); - if (ret) - goto out_done; - } - - if (ctx->enable_cdpl3) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); - if (ret) - goto out_cdpl2; - } - - if (ctx->enable_mba_mbps) { - ret = set_mba_sc(true); - if (ret) - goto out_cdpl3; - } - - if (ctx->enable_debug) - resctrl_debug = true; - - return 0; - -out_cdpl3: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); -out_cdpl2: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -out_done: - return ret; -} - -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) -{ - struct resctrl_schema *s; - const char *suffix = ""; - int ret, cl; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->res = r; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - s->conf_type = type; - switch (type) { - case CDP_CODE: - suffix = "CODE"; - break; - case CDP_DATA: - suffix = "DATA"; - break; - case CDP_NONE: - suffix = ""; - break; - } - - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); - - /* - * If CDP is supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the filesystem. 
- */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; - - if (cl > max_name_width) - max_name_width = cl; - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - s->fmt_str = "%d=%x"; - break; - case RESCTRL_SCHEMA_RANGE: - s->fmt_str = "%d=%u"; - break; - } - - if (WARN_ON_ONCE(!s->fmt_str)) { - kfree(s); - return -EINVAL; - } - - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); - - return 0; -} - -static int schemata_list_create(void) -{ - struct rdt_resource *r; - int ret = 0; - - for_each_alloc_capable_rdt_resource(r) { - if (resctrl_arch_get_cdp_enabled(r->rid)) { - ret = schemata_list_add(r, CDP_CODE); - if (ret) - break; - - ret = schemata_list_add(r, CDP_DATA); - } else { - ret = schemata_list_add(r, CDP_NONE); - } - - if (ret) - break; - } - - return ret; -} - -static void schemata_list_destroy(void) -{ - struct resctrl_schema *s, *tmp; - - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { - list_del(&s->list); - kfree(s); - } -} - -static int rdt_get_tree(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; - struct rdt_resource *r; - int ret; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - /* - * resctrl file system can only be mounted once. - */ - if (resctrl_mounted) { - ret = -EBUSY; - goto out; - } - - ret = rdtgroup_setup_root(ctx); - if (ret) - goto out; - - ret = rdt_enable_ctx(ctx); - if (ret) - goto out_root; - - ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } - - closid_init(); - - if (resctrl_arch_mon_capable()) - flags |= RFTYPE_MON; - - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); - if (ret) - goto out_schemata_free; - - kernfs_activate(rdtgroup_default.kn); - - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret < 0) - goto out_schemata_free; - - if (resctrl_arch_mon_capable()) { - ret = mongroup_create_dir(rdtgroup_default.kn, - &rdtgroup_default, "mon_groups", - &kn_mongrp); - if (ret < 0) - goto out_info; - - ret = mkdir_mondata_all(rdtgroup_default.kn, - &rdtgroup_default, &kn_mondata); - if (ret < 0) - goto out_mongrp; - rdtgroup_default.mon.mon_data_kn = kn_mondata; - } - - ret = rdt_pseudo_lock_init(); - if (ret) - goto out_mondata; - - ret = kernfs_get_tree(fc); - if (ret < 0) - goto out_psl; - - if (resctrl_arch_alloc_capable()) - resctrl_arch_enable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_enable_mon(); - - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) - resctrl_mounted = true; - - if (resctrl_is_mbm_enabled()) { - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - list_for_each_entry(dom, &r->mon_domains, hdr.list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - goto out; - -out_psl: - rdt_pseudo_lock_release(); -out_mondata: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mondata); -out_mongrp: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mongrp); -out_info: - kernfs_remove(kn_info); -out_schemata_free: - schemata_list_destroy(); -out_ctx: - rdt_disable_ctx(); -out_root: - rdtgroup_destroy_root(); -out: - rdt_last_cmd_clear(); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -enum rdt_param { - Opt_cdp, - Opt_cdpl2, - Opt_mba_mbps, - Opt_debug, - nr__rdt_params -}; - -static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - 
fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - {} -}; - -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - struct fs_parse_result result; - const char *msg; - int opt; - - opt = fs_parse(fc, rdt_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_cdp: - ctx->enable_cdpl3 = true; - return 0; - case Opt_cdpl2: - ctx->enable_cdpl2 = true; - return 0; - case Opt_mba_mbps: - msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; - if (!supports_mba_mbps()) - return invalfc(fc, msg); - ctx->enable_mba_mbps = true; - return 0; - case Opt_debug: - ctx->enable_debug = true; - return 0; - } - - return -EINVAL; -} - -static void rdt_fs_context_free(struct fs_context *fc) +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { - struct rdt_fs_context *ctx = rdt_fc2context(fc); - - kernfs_free_fs_context(fc); - kfree(ctx); -} - -static const struct fs_context_operations rdt_fs_context_ops = { - .free = rdt_fs_context_free, - .parse_param = rdt_parse_param, - .get_tree = rdt_get_tree, -}; - -static int rdt_init_fs_context(struct fs_context *fc) -{ - struct rdt_fs_context *ctx; - - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; - fc->fs_private = &ctx->kfc; - fc->ops = &rdt_fs_context_ops; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(&init_user_ns); - fc->global = true; - return 0; + return rdt_resources_all[l].cdp_enabled; } void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) @@ -2953,1460 +260,3 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) return; } - -/* - * Move tasks from one to the other group. If @from is NULL, then all tasks - * in the systems are moved unconditionally (used for teardown). - * - * If @mask is not NULL the cpus on which moved tasks are running are set - * in that mask so the update smp function call is restricted to affected - * cpus. - */ -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, - struct cpumask *mask) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (!from || is_closid_match(t, from) || - is_rmid_match(t, from)) { - resctrl_arch_set_closid_rmid(t, to->closid, - to->mon.rmid); - - /* - * Order the closid/rmid stores above before the loads - * in task_curr(). This pairs with the full barrier - * between the rq->curr update and resctrl_sched_in() - * during context switch. - */ - smp_mb(); - - /* - * If the task is on a CPU, set the CPU in the mask. - * The detection is inaccurate as tasks might move or - * schedule before the smp function call takes place. - * In such a case the function call is pointless, but - * there is no other side effect. - */ - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) - cpumask_set_cpu(task_cpu(t), mask); - } - } - read_unlock(&tasklist_lock); -} - -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) -{ - struct rdtgroup *sentry, *stmp; - struct list_head *head; - - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->closid, sentry->mon.rmid); - list_del(&sentry->mon.crdtgrp_list); - - if (atomic_read(&sentry->waitcount) != 0) - sentry->flags = RDT_DELETED; - else - rdtgroup_remove(sentry); - } -} - -/* - * Forcibly remove all of subdirectories under root. 
- */ -static void rmdir_all_sub(void) -{ - struct rdtgroup *rdtgrp, *tmp; - - /* Move all tasks to the default resource group */ - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); - - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { - /* Free any child rmids */ - free_all_child_rdtgrp(rdtgrp); - - /* Remove each rdtgroup other than root */ - if (rdtgrp == &rdtgroup_default) - continue; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - - /* - * Give any CPUs back to the default group. We cannot copy - * cpu_online_mask because a CPU might have executed the - * offline callback already, but is still marked online. - */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - kernfs_remove(rdtgrp->kn); - list_del(&rdtgrp->rdtgroup_list); - - if (atomic_read(&rdtgrp->waitcount) != 0) - rdtgrp->flags = RDT_DELETED; - else - rdtgroup_remove(rdtgrp); - } - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - update_closid_rmid(cpu_online_mask, &rdtgroup_default); - - kernfs_remove(kn_info); - kernfs_remove(kn_mongrp); - kernfs_remove(kn_mondata); -} - -static void rdt_kill_sb(struct super_block *sb) -{ - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_disable_ctx(); - - /* Put everything back to default values. */ - for_each_alloc_capable_rdt_resource(r) - resctrl_arch_reset_all_ctrls(r); - - rmdir_all_sub(); - rdt_pseudo_lock_release(); - rdtgroup_default.mode = RDT_MODE_SHAREABLE; - schemata_list_destroy(); - rdtgroup_destroy_root(); - if (resctrl_arch_alloc_capable()) - resctrl_arch_disable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_disable_mon(); - resctrl_mounted = false; - kernfs_kill_sb(sb); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .init_fs_context = rdt_init_fs_context, - .parameters = rdt_fs_parameters, - .kill_sb = rdt_kill_sb, -}; - -static int mon_addfile(struct kernfs_node *parent_kn, const char *name, - void *priv) -{ - struct kernfs_node *kn; - int ret = 0; - - kn = __kernfs_create_file(parent_kn, name, 0444, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - &kf_mondata_ops, priv, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return ret; -} - -static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) -{ - struct kernfs_node *kn; - - kn = kernfs_find_and_get(pkn, name); - if (!kn) - return; - kernfs_put(kn); - - if (kn->dir.subdirs <= 1) - kernfs_remove(kn); - else - kernfs_remove_by_name(kn, subname); -} - -/* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. - */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct rdtgroup *prgrp, *crgrp; - char subname[32]; - bool snc_mode; - char name[32]; - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); - - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); - } -} - -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) -{ - struct rmid_read rr = {0}; - union mon_data_bits priv; - struct mon_evt *mevt; - int ret; - - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - priv.u.rid = r->rid; - priv.u.domid = do_sum ? d->ci->id : d->hdr.id; - priv.u.sum = do_sum; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); - if (ret) - return ret; - - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); - } - - return 0; -} - -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) -{ - struct kernfs_node *kn, *ckn; - char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); - kn = kernfs_find_and_get(parent_kn, name); - if (kn) { - /* - * rdtgroup_mutex will prevent this directory from being - * removed. No need to keep this hold. - */ - kernfs_put(kn); - } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; - } - - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; - - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } - - kernfs_activate(kn); - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/* - * Add all subdirectories of mon_data for "ctrl_mon" groups - * and "monitor" groups with given domain id. - */ -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); - } - } -} - -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, - struct rdt_resource *r, - struct rdtgroup *prgrp) -{ - struct rdt_mon_domain *dom; - int ret; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); - if (ret) - return ret; - } - - return 0; -} - -/* - * This creates a directory mon_data which contains the monitored data. 
- * - * mon_data has one directory for each domain which are named - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data - * with L3 domain looks as below: - * ./mon_data: - * mon_L3_00 - * mon_L3_01 - * mon_L3_02 - * ... - * - * Each domain directory has one file per event: - * ./mon_L3_00/: - * llc_occupancy - * - */ -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **dest_kn) -{ - struct rdt_resource *r; - struct kernfs_node *kn; - int ret; - - /* - * Create the mon_data directory first. - */ - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); - if (ret) - return ret; - - if (dest_kn) - *dest_kn = kn; - - /* - * Create the subdirectories for each domain. Note that all events - * in a domain like L3 are grouped into a resource whose domain is L3 - */ - for_each_mon_capable_rdt_resource(r) { - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); - if (ret) - goto out_destroy; - } - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/** - * cbm_ensure_valid - Enforce validity on provided CBM - * @_val: Candidate CBM - * @r: RDT resource to which the CBM belongs - * - * The provided CBM represents all cache portions available for use. This - * may be represented by a bitmap that does not consist of contiguous ones - * and thus be an invalid CBM. - * Here the provided CBM is forced to be a valid CBM by only considering - * the first set of contiguous bits as valid and clearing all bits. - * The intention here is to provide a valid default CBM with which a new - * resource group is initialized. The user can follow this with a - * modification to the CBM if the default does not satisfy the - * requirements. - */ -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) -{ - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit; - unsigned long val = _val; - - if (!val) - return 0; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); - return (u32)val; -} - -/* - * Initialize cache resources per RDT domain - * - * Set the RDT domain up to start off with all usable allocations. That is, - * all shareable and unused bits. All-zero CBM is invalid. - */ -static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, - u32 closid) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 used_b = 0, unused_b = 0; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; - u32 peer_ctl, ctrl_val; - int i; - - cfg = &d->staged_config[t]; - cfg->have_new_ctrl = false; - cfg->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - for (i = 0; i < closids_supported(); i++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - /* - * ctrl values for locksetup aren't relevant - * until the schemata is written, and the mode - * becomes RDT_MODE_PSEUDO_LOCKED. - */ - continue; - /* - * If CDP is active include peer domain's - * usage to ensure there is no overlap - * with an exclusive group. 
- */ - if (resctrl_arch_get_cdp_enabled(r->rid)) - peer_ctl = resctrl_arch_get_config(r, d, i, - peer_type); - else - peer_ctl = 0; - ctrl_val = resctrl_arch_get_config(r, d, i, - s->conf_type); - used_b |= ctrl_val | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= ctrl_val | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - cfg->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure that - * bitmap_weight() does not access out-of-bound memory. - */ - tmp_cbm = cfg->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); - return -ENOSPC; - } - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Initialize cache resources with default values. - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. - * - * If there are no more shareable bits available on any domain then - * the entire allocation will fail. - */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) -{ - struct rdt_ctrl_domain *d; - int ret; - - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { - ret = __init_one_rdt_domain(d, s, closid); - if (ret < 0) - return ret; - } - - return 0; -} - -/* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) -{ - struct resctrl_staged_config *cfg; - struct rdt_ctrl_domain *d; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (is_mba_sc(r)) { - d->mbps_val[closid] = MBA_MAX_MBPS; - continue; - } - - cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); - cfg->have_new_ctrl = true; - } -} - -/* Initialize the RDT group's allocations. 
*/ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - int ret = 0; - - rdt_staged_configs_clear(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { - rdtgroup_init_mba(r, rdtgrp->closid); - if (is_mba_sc(r)) - continue; - } else { - ret = rdtgroup_init_cat(s, rdtgrp->closid); - if (ret < 0) - goto out; - } - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Failed to initialize allocations\n"); - goto out; - } - - } - - rdtgrp->mode = RDT_MODE_SHAREABLE; - -out: - rdt_staged_configs_clear(); - return ret; -} - -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) -{ - int ret; - - if (!resctrl_arch_mon_capable()) - return 0; - - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - return 0; -} - -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) -{ - if (resctrl_arch_mon_capable()) - free_rmid(rgrp->closid, rgrp->mon.rmid); -} - -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - const char *name, umode_t mode, - enum rdt_group_type rtype, struct rdtgroup **r) -{ - struct rdtgroup *prdtgrp, *rdtgrp; - unsigned long files = 0; - struct kernfs_node *kn; - int ret; - - prdtgrp = rdtgroup_kn_lock_live(parent_kn); - if (!prdtgrp) { - ret = -ENODEV; - goto out_unlock; - } - - /* - * Check that the parent directory for a monitor group is a "mon_groups" - * directory. - */ - if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { - ret = -EPERM; - goto out_unlock; - } - - if (rtype == RDTMON_GROUP && - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto out_unlock; - } - - /* allocate the rdtgroup. */ - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); - if (!rdtgrp) { - ret = -ENOSPC; - rdt_last_cmd_puts("Kernel out of memory\n"); - goto out_unlock; - } - *r = rdtgrp; - rdtgrp->mon.parent = prdtgrp; - rdtgrp->type = rtype; - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); - - /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - rdt_last_cmd_puts("kernfs create error\n"); - goto out_free_rgrp; - } - rdtgrp->kn = kn; - - /* - * kernfs_remove() will drop the reference count on "kn" which - * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, - * which will be dropped by kernfs_put() in rdtgroup_remove(). 
- */ - kernfs_get(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - rdt_last_cmd_puts("kernfs perm error\n"); - goto out_destroy; - } - - if (rtype == RDTCTRL_GROUP) { - files = RFTYPE_BASE | RFTYPE_CTRL; - if (resctrl_arch_mon_capable()) - files |= RFTYPE_MON; - } else { - files = RFTYPE_BASE | RFTYPE_MON; - } - - ret = rdtgroup_add_files(kn, files); - if (ret) { - rdt_last_cmd_puts("kernfs fill error\n"); - goto out_destroy; - } - - /* - * The caller unlocks the parent_kn upon success. - */ - return 0; - -out_destroy: - kernfs_put(rdtgrp->kn); - kernfs_remove(rdtgrp->kn); -out_free_rgrp: - kfree(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) -{ - kernfs_remove(rgrp->kn); - rdtgroup_remove(rgrp); -} - -/* - * Create a monitor group under "mon_groups" directory of a control - * and monitor group(ctrl_mon). This is a resource group - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. - */ -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp, *prgrp; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); - if (ret) - return ret; - - prgrp = rdtgrp->mon.parent; - rdtgrp->closid = prgrp->closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) { - mkdir_rdt_prepare_clean(rdtgrp); - goto out_unlock; - } - - kernfs_activate(rdtgrp->kn); - - /* - * Add the rdtgrp to the list of rdtgrps the parent - * ctrl_mon group has to track. - */ - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * These are rdtgroups created under the root directory. Can be used - * to allocate and monitor resources. - */ -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp; - struct kernfs_node *kn; - u32 closid; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); - if (ret) - return ret; - - kn = rdtgrp->kn; - ret = closid_alloc(); - if (ret < 0) { - rdt_last_cmd_puts("Out of CLOSIDs\n"); - goto out_common_fail; - } - closid = ret; - ret = 0; - - rdtgrp->closid = closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_closid_free; - - kernfs_activate(rdtgrp->kn); - - ret = rdtgroup_init_alloc(rdtgrp); - if (ret < 0) - goto out_rmid_free; - - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - - if (resctrl_arch_mon_capable()) { - /* - * Create an empty mon_groups directory to hold the subset - * of tasks and cpus to monitor. - */ - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_del_list; - } - if (is_mba_sc(NULL)) - rdtgrp->mba_mbps_event = mba_mbps_default_event; - } - - goto out_unlock; - -out_del_list: - list_del(&rdtgrp->rdtgroup_list); -out_rmid_free: - mkdir_rdt_prepare_rmid_free(rdtgrp); -out_closid_free: - closid_free(closid); -out_common_fail: - mkdir_rdt_prepare_clean(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - /* Do not accept '\n' to avoid unparsable situation. 
*/ - if (strchr(name, '\n')) - return -EINVAL; - - /* - * If the parent directory is the root directory and RDT - * allocation is supported, add a control and monitoring - * subdirectory - */ - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - - /* Else, attempt to add a monitoring subdirectory. */ - if (resctrl_arch_mon_capable()) - return rdtgroup_mkdir_mon(parent_kn, name, mode); - - return -EPERM; -} - -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the parent group */ - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - - /* - * Update per cpu closid/rmid of the moved CPUs first. - * Note: the closid will not change, but the arch code still needs it. - */ - closid = prdtgrp->closid; - rmid = prdtgrp->mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - /* - * Remove the rdtgrp from the parent ctrl_mon group's list - */ - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_del(&rdtgrp->mon.crdtgrp_list); - - kernfs_remove(rdtgrp->kn); - - return 0; -} - -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) -{ - rdtgrp->flags = RDT_DELETED; - list_del(&rdtgrp->rdtgroup_list); - - kernfs_remove(rdtgrp->kn); - return 0; -} - -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the default group */ - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); - - /* Give any CPUs back to the default group */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - /* Update per cpu closid and rmid of the moved CPUs first */ - closid = rdtgroup_default.closid; - rmid = rdtgroup_default.mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - closid_free(rdtgrp->closid); - - rdtgroup_ctrl_remove(rdtgrp); - - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - - return 0; -} - -static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) -{ - /* - * Valid within the RCU section it was obtained or while rdtgroup_mutex - * is held. - */ - return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); -} - -static int rdtgroup_rmdir(struct kernfs_node *kn) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret = 0; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } - parent_kn = rdt_kn_parent(kn); - - /* - * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl_mon group. - * - * If the rdtgroup is a mon group and parent directory - * is a valid "mon_groups" directory, remove the mon group. 
- */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && - rdtgrp != &rdtgroup_default) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(rdtgrp); - } else { - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); - } - } else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, rdt_kn_name(kn))) { - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); - } else { - ret = -EPERM; - } - -out: - rdtgroup_kn_unlock(kn); - free_cpumask_var(tmpmask); - return ret; -} - -/** - * mongrp_reparent() - replace parent CTRL_MON group of a MON group - * @rdtgrp: the MON group whose parent should be replaced - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp - * @cpus: cpumask provided by the caller for use during this call - * - * Replaces the parent CTRL_MON group for a MON group, resulting in all member - * tasks' CLOSID immediately changing to that of the new parent group. - * Monitoring data for the group is unaffected by this operation. - */ -static void mongrp_reparent(struct rdtgroup *rdtgrp, - struct rdtgroup *new_prdtgrp, - cpumask_var_t cpus) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - - WARN_ON(rdtgrp->type != RDTMON_GROUP); - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); - - /* Nothing to do when simply renaming a MON group. */ - if (prdtgrp == new_prdtgrp) - return; - - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_move_tail(&rdtgrp->mon.crdtgrp_list, - &new_prdtgrp->mon.crdtgrp_list); - - rdtgrp->mon.parent = new_prdtgrp; - rdtgrp->closid = new_prdtgrp->closid; - - /* Propagate updated closid to all tasks in this group. */ - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); - - update_closid_rmid(cpus, NULL); -} - -static int rdtgroup_rename(struct kernfs_node *kn, - struct kernfs_node *new_parent, const char *new_name) -{ - struct kernfs_node *kn_parent; - struct rdtgroup *new_prdtgrp; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret; - - rdtgrp = kernfs_to_rdtgroup(kn); - new_prdtgrp = kernfs_to_rdtgroup(new_parent); - if (!rdtgrp || !new_prdtgrp) - return -ENOENT; - - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ - rdtgroup_kn_get(rdtgrp, kn); - rdtgroup_kn_get(new_prdtgrp, new_parent); - - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - /* - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if - * either kernfs_node is a file. - */ - if (kernfs_type(kn) != KERNFS_DIR || - kernfs_type(new_parent) != KERNFS_DIR) { - rdt_last_cmd_puts("Source and destination must be directories"); - ret = -EPERM; - goto out; - } - - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { - ret = -ENOENT; - goto out; - } - - kn_parent = rdt_kn_parent(kn); - if (rdtgrp->type != RDTMON_GROUP || !kn_parent || - !is_mon_groups(kn_parent, rdt_kn_name(kn))) { - rdt_last_cmd_puts("Source must be a MON group\n"); - ret = -EPERM; - goto out; - } - - if (!is_mon_groups(new_parent, new_name)) { - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); - ret = -EPERM; - goto out; - } - - /* - * If the MON group is monitoring CPUs, the CPUs must be assigned to the - * current parent CTRL_MON group and therefore cannot be assigned to - * the new parent, making the move illegal. 
- */ - if (!cpumask_empty(&rdtgrp->cpu_mask) && - rdtgrp->mon.parent != new_prdtgrp) { - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); - ret = -EPERM; - goto out; - } - - /* - * Allocate the cpumask for use in mongrp_reparent() to avoid the - * possibility of failing to allocate it after kernfs_rename() has - * succeeded. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out; - } - - /* - * Perform all input validation and allocations needed to ensure - * mongrp_reparent() will succeed before calling kernfs_rename(), - * otherwise it would be necessary to revert this call if - * mongrp_reparent() failed. - */ - ret = kernfs_rename(kn, new_parent, new_name); - if (!ret) - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); - - free_cpumask_var(tmpmask); - -out: - mutex_unlock(&rdtgroup_mutex); - rdtgroup_kn_put(rdtgrp, kn); - rdtgroup_kn_put(new_prdtgrp, new_parent); - return ret; -} - -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - seq_puts(seq, ",cdp"); - - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - seq_puts(seq, ",cdpl2"); - - if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) - seq_puts(seq, ",mba_MBps"); - - if (resctrl_debug) - seq_puts(seq, ",debug"); - - return 0; -} - -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, - .rename = rdtgroup_rename, - .show_options = rdtgroup_show_options, -}; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx) -{ - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, - &rdtgroup_default); - if (IS_ERR(rdt_root)) - return PTR_ERR(rdt_root); - - ctx->kfc.root = rdt_root; - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - - return 0; -} - -static void rdtgroup_destroy_root(void) -{ - kernfs_destroy_root(rdt_root); - rdtgroup_default.kn = NULL; -} - -static void __init rdtgroup_setup_default(void) -{ - mutex_lock(&rdtgroup_mutex); - - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; - rdtgroup_default.type = RDTCTRL_GROUP; - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); - - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - - mutex_unlock(&rdtgroup_mutex); -} - -static void domain_destroy_mon_state(struct rdt_mon_domain *d) -{ - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); -} - -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) - mba_sc_domain_destroy(r, d); - - mutex_unlock(&rdtgroup_mutex); -} - -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); - - if (resctrl_is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. 
- */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } - - domain_destroy_mon_state(d); - - mutex_unlock(&rdtgroup_mutex); -} - -/** - * domain_setup_mon_state() - Initialise domain monitoring structures. - * @r: The resource for the newly online domain. - * @d: The newly online domain. - * - * Allocate monitor resources that belong to this domain. - * Called when the first CPU of a domain comes online, regardless of whether - * the filesystem is mounted. - * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). - * - * Returns 0 for success, or -ENOMEM. - */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; - - if (resctrl_arch_is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } - } - - return 0; -} - -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - int err = 0; - - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { - /* RDT_RESOURCE_MBA is never mon_capable */ - err = mba_sc_domain_allocate(r, d); - } - - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int err; - - mutex_lock(&rdtgroup_mutex); - - err = domain_setup_mon_state(r, d); - if (err) - goto out_unlock; - - if (resctrl_is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - if (resctrl_arch_is_llc_occupancy_enabled()) - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - - /* - * If the filesystem is not mounted then only the default resource group - * exists. Creation of its directories is deferred until mount time - * by rdt_get_tree() calling mkdir_mondata_all(). - * If resctrl is mounted, add per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -void resctrl_online_cpu(unsigned int cpu) -{ - mutex_lock(&rdtgroup_mutex); - /* The CPU is set in default rdtgroup after online. 
*/ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - mutex_unlock(&rdtgroup_mutex); -} - -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) - break; - } -} - -static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_mon_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -void resctrl_offline_cpu(unsigned int cpu) -{ - struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } - - if (!l3->mon_capable) - goto out_unlock; - - d = get_mon_domain_from_cpu(cpu, l3); - if (d) { - if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0, cpu); - } - if (resctrl_arch_is_llc_occupancy_enabled() && - cpu == d->cqm_work_cpu && has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0, cpu); - } - } - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -/* - * resctrl_init - resctrl filesystem initialization - * - * Setup resctrl file system including set up root, create mount point, - * register resctrl filesystem, and initialize files under root directory. - * - * Return: 0 on success or -errno - */ -int __init resctrl_init(void) -{ - int ret = 0; - - seq_buf_init(&last_cmd_status, last_cmd_status_buf, - sizeof(last_cmd_status_buf)); - - rdtgroup_setup_default(); - - thread_throttle_mode_init(); - - ret = resctrl_mon_resource_init(); - if (ret) - return ret; - - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); - if (ret) { - resctrl_mon_resource_exit(); - return ret; - } - - ret = register_filesystem(&rdt_fs_type); - if (ret) - goto cleanup_mountpoint; - - /* - * Adding the resctrl debugfs directory here may not be ideal since - * it would let the resctrl debugfs directory appear on the debugfs - * filesystem before the resctrl filesystem is mounted. - * It may also be ok since that would enable debugging of RDT before - * resctrl is mounted. - * The reason why the debugfs directory is created here and not in - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and - * during the debugfs directory creation also &sb->s_type->i_mutex_key - * (the lockdep class of inode->i_rwsem). Other filesystem - * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_lock - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex - * is taken, thus creating dependency: - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause - * issues considering the other two lock dependencies. - * By creating the debugfs directory here we avoid a dependency - * that may cause deadlock (even though file operations cannot - * occur until the filesystem is mounted, but I do not know how to - * tell lockdep that). 
- */ - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); - - return 0; - -cleanup_mountpoint: - sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); - - return ret; -} - -void __exit resctrl_exit(void) -{ - debugfs_remove_recursive(debugfs_resctrl); - unregister_filesystem(&rdt_fs_type); - sysfs_remove_mount_point(fs_kobj, "resctrl"); - - resctrl_mon_resource_exit(); -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index dbf6d71bdf18..b4a1f6732a3a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -50,6 +50,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, + { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h index 4eddb4d571ef..30f39f92c98f 100644 --- a/arch/x86/kernel/cpu/sgx/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -2,7 +2,6 @@ #ifndef __ARCH_SGX_DRIVER_H__ #define __ARCH_SGX_DRIVER_H__ -#include <crypto/hash.h> #include <linux/kref.h> #include <linux/mmu_notifier.h> #include <linux/radix-tree.h> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 776a20172867..66f1efa16fbb 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -3,6 +3,7 @@ #include <asm/mman.h> #include <asm/sgx.h> +#include <crypto/sha2.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) return ret; } -static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, - void *hash) -{ - SHASH_DESC_ON_STACK(shash, tfm); - - shash->tfm = tfm; - - return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); -} - -static int sgx_get_key_hash(const void *modulus, void *hash) -{ - struct crypto_shash *tfm; - int ret; - - tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = __sgx_get_key_hash(tfm, modulus, hash); - - crypto_free_shash(tfm); - return ret; -} - static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { @@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, sgx_xfrm_reserved_mask) return -EINVAL; - ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); - if (ret) - return ret; + sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner); mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 6722b2fc82cf..2de01b379aa3 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -720,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags) goto out; } + sgx_unmark_page_reclaimable(page); + /* * TBD: Add additional plumbing to enable pre-emptive * action for asynchronous poison notification. 
Until diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0be61c45400c..bcb534688dfe 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -278,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; + int ret; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; @@ -286,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, /* Exclude elf header region */ start = image->elf_load_addr; end = start + image->elf_headers_sz - 1; - return crash_exclude_mem_range(cmem, start, end); + ret = crash_exclude_mem_range(cmem, start, end); + + if (ret) + return ret; + + /* Exclude dm crypt keys region */ + if (image->dm_crypt_keys_addr) { + start = image->dm_crypt_keys_addr; + end = start + image->dm_crypt_keys_sz - 1; + return crash_exclude_mem_range(cmem, start, end); + } + + return ret; } /* Prepare memory map for crash dump kernel */ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { + unsigned int nr_ranges = 0; int i, ret = 0; unsigned long flags; struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(struct_size(cmem, ranges, 1)); + /* + * Using random kexec_buf for passing dm crypt keys may cause a range + * split. So use two slots here. + */ + nr_ranges = 2; + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return -ENOMEM; + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9920122018a0..c3acbd26408b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1300,6 +1300,24 @@ void __init e820__memblock_setup(void) } /* + * At this point memblock is only allowed to allocate from memory + * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set + * up in init_mem_mapping(). + * + * KHO kernels are special and use only scratch memory for memblock + * allocations, but memory below 1M is ignored by kernel after early + * boot and cannot be naturally marked as scratch. + * + * To allow allocation of the real-mode trampoline and a few (if any) + * other very early allocations from below 1M forcibly mark the memory + * below 1M as scratch. + * + * After real mode trampoline is allocated, we clear that scratch + * marking. + */ + memblock_mark_kho_scratch(0, SZ_1M); + + /* * 32-bit systems are limited to 4BG of memory even with HIGHMEM and * to even less without it. * Discard memory after max_pfn - the actual limit detected at runtime. diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ea138583dd92..aefd412a23dc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -37,6 +37,7 @@ DEFINE_PER_CPU(u64, xfd_state); /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; +struct vcpu_fpu_config guest_default_cfg __ro_after_init; /* * Represents the initial FPU state. 
It's mostly (but not completely) zeroes, @@ -217,7 +218,7 @@ void fpu_reset_from_exception_fixup(void) } #if IS_ENABLED(CONFIG_KVM) -static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); +static void __fpstate_reset(struct fpstate *fpstate); static void fpu_lock_guest_permissions(void) { @@ -242,19 +243,21 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) struct fpstate *fpstate; unsigned int size; - size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); if (!fpstate) return false; - /* Leave xfd to 0 (the reset value defined by spec) */ - __fpstate_reset(fpstate, 0); - fpstate_init_user(fpstate); + /* Initialize indicators to reflect properties of the fpstate */ fpstate->is_valloc = true; fpstate->is_guest = true; + __fpstate_reset(fpstate); + fpstate_init_user(fpstate); + gfpu->fpstate = fpstate; - gfpu->xfeatures = fpu_kernel_cfg.default_features; + gfpu->xfeatures = guest_default_cfg.features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -541,28 +544,50 @@ void fpstate_init_user(struct fpstate *fpstate) fpstate_init_fstate(fpstate); } -static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) +static void __fpstate_reset(struct fpstate *fpstate) { - /* Initialize sizes and feature masks */ - fpstate->size = fpu_kernel_cfg.default_size; + /* + * Supervisor features (and thus sizes) may diverge between guest + * FPUs and host FPUs, as some supervisor features are supported + * for guests despite not being utilized by the host. User + * features and sizes are always identical, which allows for + * common guest and userspace ABI. + * + * For the host, set XFD to the kernel's desired initialization + * value. For guests, set XFD to its architectural RESET value. + */ + if (fpstate->is_guest) { + fpstate->size = guest_default_cfg.size; + fpstate->xfeatures = guest_default_cfg.features; + fpstate->xfd = 0; + } else { + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->xfd = init_fpstate.xfd; + } + fpstate->user_size = fpu_user_cfg.default_size; - fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; - fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; - __fpstate_reset(fpu->fpstate, init_fpstate.xfd); + __fpstate_reset(fpu->fpstate); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; - /* Same defaults for guests */ - fpu->guest_perm = fpu->perm; + + fpu->guest_perm.__state_perm = guest_default_cfg.features; + fpu->guest_perm.__state_size = guest_default_cfg.size; + /* + * User features and sizes are always identical between host and + * guest FPUs, which allows for common guest and userspace ABI. 
+ */ + fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 99db41bf9fa6..ff988b9ea39f 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -205,6 +205,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; + guest_default_cfg.size = size; } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9aa9ac8399ae..12ed75c1b567 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -57,7 +57,7 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "Control-flow User registers", - "Control-flow Kernel registers (unused)", + "Control-flow Kernel registers (KVM only)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", @@ -81,6 +81,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, + [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, @@ -372,6 +373,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_CET_KERNEL | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) @@ -573,6 +575,7 @@ static bool __init check_xstate_against_struct(int nr) case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state); case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: @@ -743,6 +746,9 @@ static int __init init_xstate_size(void) fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); + guest_default_cfg.size = + xstate_calculate_size(guest_default_cfg.features, compacted); + return 0; } @@ -763,6 +769,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; + guest_default_cfg.size = legacy_size; /* * Prevent enabling the static branch which enables writes to the @@ -773,6 +780,24 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) fpstate_reset(x86_task_fpu(current)); } +static u64 __init host_default_mask(void) +{ + /* + * Exclude dynamic features (require userspace opt-in) and features + * that are supported only for KVM guests. + */ + return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR); +} + +static u64 __init guest_default_mask(void) +{ + /* + * Exclude dynamic features, which require userspace opt-in even + * for KVM guests. + */ + return ~(u64)XFEATURE_MASK_USER_DYNAMIC; +} + /* * Enable and initialize the xsave feature. * Called once per system bootup. 
@@ -855,12 +880,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; - /* Clean out dynamic features from default */ - fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; - fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; - - fpu_user_cfg.default_features = fpu_user_cfg.max_features; - fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + /* + * Now, given maximum feature set, determine default values by + * applying default masks. + */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask(); + fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask(); + guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask(); /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 6290dd120f5e..ff40f09ad911 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(struct task_struct *tsk) +static void task_update_io_bitmap(void) { + struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { @@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk) struct io_bitmap *iobm = tsk->thread.io_bitmap; tsk->thread.io_bitmap = NULL; - task_update_io_bitmap(tsk); + /* + * Don't touch the TSS when invoked on a failed fork(). TSS + * reflects the state of @current and not the state of @tsk. + */ + if (tsk == current) + task_update_io_bitmap(); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(current); - + task_update_io_bitmap(); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 81f9b78e0f7b..9ed29ff10e59 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -380,61 +380,18 @@ void intel_posted_msi_init(void) this_cpu_write(posted_msi_pi_desc.ndst, destination); } -/* - * De-multiplexing posted interrupts is on the performance path, the code - * below is written to optimize the cache performance based on the following - * considerations: - * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently - * accessed by both CPU and IOMMU. - * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg - * for checking and clearing posted interrupt request (PIR), a 256 bit field - * within the PID. - * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache - * line when posting interrupts and setting control bits. - * 4.The CPU can access the cache line a magnitude faster than the IOMMU. - * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID - * cache line. The cache line states after each operation are as follows: - * CPU IOMMU PID Cache line state - * --------------------------------------------------------------- - *...read64 exclusive - *...lock xchg64 modified - *... post/atomic swap invalid - *...------------------------------------------------------------- - * - * To reduce L1 data cache miss, it is important to avoid contention with - * IOMMU's interrupt posting/atomic swap. 
Therefore, a copy of PIR is used - * to dispatch interrupt handlers. - * - * In addition, the code is trying to keep the cache line state consistent - * as much as possible. e.g. when making a copy and clearing the PIR - * (assuming non-zero PIR bits are present in the entire PIR), it does: - * read, read, read, read, xchg, xchg, xchg, xchg - * instead of: - * read, xchg, read, xchg, read, xchg, read, xchg - */ -static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { - int i, vec = FIRST_EXTERNAL_VECTOR; - unsigned long pir_copy[4]; - bool handled = false; + unsigned long pir_copy[NR_PIR_WORDS]; + int vec = FIRST_EXTERNAL_VECTOR; - for (i = 0; i < 4; i++) - pir_copy[i] = pir[i]; - - for (i = 0; i < 4; i++) { - if (!pir_copy[i]) - continue; + if (!pi_harvest_pir(pir, pir_copy)) + return false; - pir_copy[i] = arch_xchg(&pir[i], 0); - handled = true; - } - - if (handled) { - for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) - call_irq_handler(vec, regs); - } + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); - return handled; + return true; } /* @@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. */ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { - if (!handle_pending_pir(pid->pir64, regs)) + if (!handle_pending_pir(pid->pir, regs)) break; } @@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * process PIR bits one last time such that handling the new interrupts * are not delayed until the next IRQ. */ - handle_pending_pir(pid->pir64, regs); + handle_pending_pir(pid->pir, regs); apic_eoi(); irq_exit(); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9cea1fc36c18..243a769fdd97 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -59,6 +59,18 @@ static ssize_t sched_itmt_enabled_write(struct file *filp, return result; } +static int sched_core_priority_show(struct seq_file *s, void *unused) +{ + int cpu; + + seq_puts(s, "CPU #\tPriority\n"); + for_each_possible_cpu(cpu) + seq_printf(s, "%d\t%d\n", cpu, arch_asym_cpu_priority(cpu)); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(sched_core_priority); + static const struct file_operations dfs_sched_itmt_fops = { .read = debugfs_read_file_bool, .write = sched_itmt_enabled_write, @@ -67,6 +79,7 @@ static const struct file_operations dfs_sched_itmt_fops = { }; static struct dentry *dfs_sched_itmt; +static struct dentry *dfs_sched_core_prio; /** * sched_set_itmt_support() - Indicate platform supports ITMT @@ -102,6 +115,14 @@ int sched_set_itmt_support(void) return -ENOMEM; } + dfs_sched_core_prio = debugfs_create_file("sched_core_priority", 0644, + arch_debugfs_dir, NULL, + &sched_core_priority_fops); + if (IS_ERR_OR_NULL(dfs_sched_core_prio)) { + dfs_sched_core_prio = NULL; + return -ENOMEM; + } + sched_itmt_capable = true; sysctl_sched_itmt_enabled = 1; @@ -133,6 +154,8 @@ void sched_clear_itmt_support(void) debugfs_remove(dfs_sched_itmt); dfs_sched_itmt = NULL; + debugfs_remove(dfs_sched_core_prio); + dfs_sched_core_prio = NULL; if (sysctl_sched_itmt_enabled) { /* disable sched_itmt if we are no longer ITMT capable */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 68530fad05f7..24a41f0e0cf1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -27,6 +27,8 @@ #include 
<asm/kexec-bzimage64.h> #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ +#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */ + /* * Defines lowest physical address for various segments. Not sure where @@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, if (image->type == KEXEC_TYPE_CRASH) { len = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", image->elf_load_addr); + + if (image->dm_crypt_keys_addr != 0) + len += sprintf(cmdline_ptr + len, + "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr); } memcpy(cmdline_ptr + len, cmdline, cmdline_len); cmdline_len += len; @@ -233,6 +239,32 @@ setup_ima_state(const struct kimage *image, struct boot_params *params, #endif /* CONFIG_IMA_KEXEC */ } +static void setup_kho(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int setup_data_offset) +{ + struct setup_data *sd = (void *)params + setup_data_offset; + struct kho_data *kho = (void *)sd + sizeof(*sd); + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + return; + + sd->type = SETUP_KEXEC_KHO; + sd->len = sizeof(struct kho_data); + + /* Only add if we have all KHO images in place */ + if (!image->kho.fdt || !image->kho.scratch) + return; + + /* Add setup data */ + kho->fdt_addr = image->kho.fdt; + kho->fdt_size = PAGE_SIZE; + kho->scratch_addr = image->kho.scratch->mem; + kho->scratch_size = image->kho.scratch->bufsz; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = params_load_addr + setup_data_offset; +} + static int setup_boot_parameters(struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -312,6 +344,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct ima_setup_data); } + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) { + /* Setup space to store preservation metadata */ + setup_kho(image, params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct kho_data); + } + /* Setup RNG seed */ setup_rng_seed(params, params_load_addr, setup_data_offset); @@ -441,6 +480,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel, ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); + ret = crash_load_dm_crypt_keys(image); + if (ret == -ENOENT) { + kexec_dprintk("No dm crypt key to load\n"); + } else if (ret) { + pr_err("Failed to load dm crypt keys\n"); + return ERR_PTR(ret); + } + if (image->dm_crypt_keys_addr && + cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN > + header->cmdline_size) { + pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } } #endif @@ -468,6 +520,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, efi_map_sz = efi_get_runtime_map_size(); params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; + if (image->dm_crypt_keys_addr) + params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + @@ -479,6 +533,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct kho_data); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); diff --git a/arch/x86/kernel/kgdb.c 
b/arch/x86/kernel/kgdb.c index 102641fd2172..8b1a9733d13e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -385,7 +385,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) struct perf_event *bp; /* Disable hardware debugging while we are in kgdb: */ - set_debugreg(0UL, 7); + set_debugreg(DR7_FIXED_1, 7); for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index b68d4be9464e..d547de9b3ed8 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -40,7 +40,7 @@ static const struct bin_attribute boot_params_data_attr = { .name = "data", .mode = S_IRUGO, }, - .read_new = boot_params_data_read, + .read = boot_params_data_read, .size = sizeof(boot_params), }; @@ -56,7 +56,7 @@ static const struct bin_attribute *const boot_params_data_attrs[] = { static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, - .bin_attrs_new = boot_params_data_attrs, + .bin_attrs = boot_params_data_attrs, }; static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) @@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = { .name = "data", .mode = S_IRUGO, }, - .read_new = setup_data_data_read, + .read = setup_data_data_read, }; static struct attribute *setup_data_type_attrs[] = { @@ -265,7 +265,7 @@ static const struct bin_attribute *const setup_data_data_attrs[] = { static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, - .bin_attrs_new = setup_data_data_attrs, + .bin_attrs = setup_data_data_attrs, }; static int __init create_setup_data_node(struct kobject *parent, diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 921c1c783bc1..8ae750cde0c6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -420,7 +420,7 @@ static u64 kvm_steal_clock(int cpu) return steal; } -static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); } diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 949c9e4bfad2..697fb99406e6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -630,13 +630,35 @@ static void kexec_mark_crashkres(bool protect) kexec_mark_range(control, crashk_res.end, protect); } +/* make the memory storing dm crypt keys in/accessible */ +static void kexec_mark_dm_crypt_keys(bool protect) +{ + unsigned long start_paddr, end_paddr; + unsigned int nr_pages; + + if (kexec_crash_image->dm_crypt_keys_addr) { + start_paddr = kexec_crash_image->dm_crypt_keys_addr; + end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1; + nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE; + if (protect) + set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages); + else + __set_memory_prot( + (unsigned long)phys_to_virt(start_paddr), + nr_pages, + __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW)); + } +} + void arch_kexec_protect_crashkres(void) { kexec_mark_crashkres(true); + kexec_mark_dm_crypt_keys(true); } void arch_kexec_unprotect_crashkres(void) { + kexec_mark_dm_crypt_keys(false); kexec_mark_crashkres(false); } #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c1d2dac72b9c..1b7960cf6eb0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -176,6 +176,7 @@ int 
copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; + clear_tsk_thread_flag(p, TIF_IO_BITMAP); p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -333,13 +334,21 @@ DEFINE_PER_CPU(u64, msr_misc_features_shadow); static void set_cpuid_faulting(bool on) { - u64 msrval; - msrval = this_cpu_read(msr_misc_features_shadow); - msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; - msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); - this_cpu_write(msr_misc_features_shadow, msrval); - wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + u64 msrval; + + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (on) + msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT); + else + msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT); + } } static void disable_cpuid(void) @@ -464,6 +473,11 @@ void native_tss_update_io_bitmap(void) } else { struct io_bitmap *iobm = t->io_bitmap; + if (WARN_ON_ONCE(!iobm)) { + clear_thread_flag(TIF_IO_BITMAP); + native_tss_invalidate_io_bitmap(); + } + /* * Only copy bitmap data when the sequence number differs. The * update time is accounted to the incoming task. @@ -901,16 +915,24 @@ static __init bool prefer_mwait_c1_over_halt(void) */ static __cpuidle void mwait_idle(void) { + if (need_resched()) + return; + + x86_idle_clear_cpu_buffers(); + if (!current_set_polling_and_test()) { const void *addr = ¤t_thread_info()->flags; alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); __monitor(addr, 0, 0); - if (!need_resched()) { - __sti_mwait(0, 0); - raw_local_irq_disable(); - } + if (need_resched()) + goto out; + + __sti_mwait(0, 0); + raw_local_irq_disable(); } + +out: __current_clr_polling(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9bd4fa694da5..3ef15c2f152f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. */ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1)) return; printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", @@ -208,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f39ff02e498d..52a5c03c353c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -133,7 +133,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. 
*/ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400))) { + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", @@ -705,7 +705,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); + + /* Reset hw history on AMD CPUs */ + if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS)) + wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 095f04bdabdc..3dcadc13f09a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1236,7 +1236,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET64_GENERAL] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -1244,7 +1244,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .set = genregs_set }, [REGSET64_FP] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -1253,7 +1253,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .set = xfpregs_set }, [REGSET64_XSTATE] = { - .core_note_type = NT_X86_XSTATE, + USER_REGSET_NOTE_TYPE(X86_XSTATE), .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, @@ -1261,7 +1261,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .set = xstateregs_set }, [REGSET64_IOPERM] = { - .core_note_type = NT_386_IOPERM, + USER_REGSET_NOTE_TYPE(386_IOPERM), .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), @@ -1270,7 +1270,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { }, #ifdef CONFIG_X86_USER_SHADOW_STACK [REGSET64_SSP] = { - .core_note_type = NT_X86_SHSTK, + USER_REGSET_NOTE_TYPE(X86_SHSTK), .n = 1, .size = sizeof(u64), .align = sizeof(u64), @@ -1297,7 +1297,7 @@ static const struct user_regset_view user_x86_64_view = { #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET32_GENERAL] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1305,7 +1305,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .set = genregs32_set }, [REGSET32_FP] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1314,7 +1314,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .set = fpregs_set }, [REGSET32_XFP] = { - .core_note_type = NT_PRXFPREG, + USER_REGSET_NOTE_TYPE(PRXFPREG), .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1323,7 +1323,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .set = xfpregs_set }, [REGSET32_XSTATE] = { - .core_note_type = NT_X86_XSTATE, + USER_REGSET_NOTE_TYPE(X86_XSTATE), .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, @@ -1331,7 +1331,7 @@ static struct user_regset x86_32_regsets[] 
__ro_after_init = { .set = xstateregs_set }, [REGSET32_TLS] = { - .core_note_type = NT_386_TLS, + USER_REGSET_NOTE_TYPE(386_TLS), .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, .size = sizeof(struct user_desc), @@ -1341,7 +1341,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .set = regset_tls_set }, [REGSET32_IOPERM] = { - .core_note_type = NT_386_IOPERM, + USER_REGSET_NOTE_TYPE(386_IOPERM), .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7d9ed79a93c0..0792f31961ac 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -213,8 +213,10 @@ arch_initcall(init_x86_sysctl); */ struct screen_info screen_info; EXPORT_SYMBOL(screen_info); +#if defined(CONFIG_FIRMWARE_EDID) struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); +#endif extern int root_mountflags; @@ -282,8 +284,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa_symbol(_brk_start), - _brk_end - _brk_start); + memblock_reserve_kern(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -356,7 +358,7 @@ static void __init early_reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image); } static void __init reserve_initrd(void) @@ -409,7 +411,7 @@ static void __init add_early_ima_buffer(u64 phys_addr) } if (data->size) { - memblock_reserve(data->addr, data->size); + memblock_reserve_kern(data->addr, data->size); ima_kexec_buffer_phys = data->addr; ima_kexec_buffer_size = data->size; } @@ -447,6 +449,29 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size) } #endif +static void __init add_kho(u64 phys_addr, u32 data_len) +{ + struct kho_data *kho; + u64 addr = phys_addr + sizeof(struct setup_data); + u64 size = data_len - sizeof(struct setup_data); + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) { + pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. 
Ignoring.\n"); + return; + } + + kho = early_memremap(addr, size); + if (!kho) { + pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n", + addr, size); + return; + } + + kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size); + + early_memunmap(kho, size); +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -475,6 +500,9 @@ static void __init parse_setup_data(void) case SETUP_IMA: add_early_ima_buffer(pa_data); break; + case SETUP_KEXEC_KHO: + add_kho(pa_data, data_len); + break; case SETUP_RNG_SEED: data = early_memremap(pa_data, data_len); add_bootloader_randomness(data->data, data->len); @@ -499,7 +527,9 @@ static void __init parse_boot_params(void) { ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; +#if defined(CONFIG_FIRMWARE_EDID) edid_info = boot_params.edid_info; +#endif #ifdef CONFIG_X86_32 apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; @@ -549,7 +579,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) len = sizeof(*data); pa_next = data->next; - memblock_reserve(pa_data, sizeof(*data) + data->len); + memblock_reserve_kern(pa_data, sizeof(*data) + data->len); if (data->type == SETUP_INDIRECT) { len += data->len; @@ -563,7 +593,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) indirect = (struct setup_indirect *)data->data; if (indirect->type != SETUP_INDIRECT) - memblock_reserve(indirect->addr, indirect->len); + memblock_reserve_kern(indirect->addr, indirect->len); } pa_data = pa_next; @@ -766,8 +796,8 @@ static void __init early_reserve_memory(void) * __end_of_kernel_reserve symbol must be explicitly reserved with a * separate memblock_reserve() or they will be discarded. */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + memblock_reserve_kern(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); /* * The first 4Kb of memory is a BIOS owned area, but generally it is diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 98123ff10506..42bbc42bd350 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; + prevent_single_step_upon_eretu(regs); + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn) struct rt_sigframe_ia32 __user *frame; sigset_t set; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ee9453891901..d483b585c6c6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18266cc3d98c..b014e6d229f9 100644 --- 
a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -299,3 +299,27 @@ struct smp_ops smp_ops = { .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); + +int arch_cpu_rescan_dead_smt_siblings(void) +{ + enum cpuhp_smt_control old = cpu_smt_control; + int ret; + + /* + * If SMT has been disabled and SMT siblings are in HLT, bring them back + * online and offline them again so that they end up in MWAIT proper. + * + * Called with hotplug enabled. + */ + if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED) + return 0; + + ret = cpuhp_smt_enable(); + if (ret) + return ret; + + ret = cpuhp_smt_disable(old); + + return ret; +} +EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b90d872aa0c8..33e166f6ab12 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -478,44 +478,41 @@ static int x86_cluster_flags(void) */ static bool x86_has_numa_in_package; -static struct sched_domain_topology_level x86_topology[6]; - -static void __init build_sched_topology(void) -{ - int i = 0; - -#ifdef CONFIG_SCHED_SMT - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) - }; -#endif +static struct sched_domain_topology_level x86_topology[] = { + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) - }; + SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) - }; + SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), #endif + SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + { NULL }, +}; + +static void __init build_sched_topology(void) +{ + struct sched_domain_topology_level *topology = x86_topology; + /* - * When there is NUMA topology inside the package skip the PKG domain - * since the NUMA domains will auto-magically create the right spanning - * domains based on the SLIT. + * When there is NUMA topology inside the package invalidate the + * PKG domain since the NUMA domains will auto-magically create the + * right spanning domains based on the SLIT. */ - if (!x86_has_numa_in_package) { - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) - }; + if (x86_has_numa_in_package) { + unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; + + memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); } /* - * There must be one trailing NULL entry left. + * Drop the SMT domains if there is only one thread per-core + * since it'll get degenerated by the scheduler anyways. */ - BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + if (cpu_smt_num_threads <= 1) + ++topology; - set_sched_topology(x86_topology); + set_sched_topology(topology); } void set_cpu_sibling_map(int cpu) @@ -695,7 +692,7 @@ static void send_init_sequence(u32 phys_apicid) /* * Wake up AP by INIT, INIT, STARTUP sequence. 
*/ -static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip) +static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu) { unsigned long send_status = 0, accept_status = 0; int num_starts, j, maxlvt; @@ -842,7 +839,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if startup was successfully sent, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle) { unsigned long start_ip = real_mode_header->trampoline_start; int ret; @@ -896,11 +893,11 @@ static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle) * - Use an INIT boot APIC message */ if (apic->wakeup_secondary_cpu_64) - ret = apic->wakeup_secondary_cpu_64(apicid, start_ip); + ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu); else if (apic->wakeup_secondary_cpu) - ret = apic->wakeup_secondary_cpu(apicid, start_ip); + ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu); else - ret = wakeup_secondary_cpu_via_init(apicid, start_ip); + ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu); /* If the wakeup mechanism failed, cleanup the warm reset vector */ if (ret) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 94c0236963c6..36354b470590 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -352,7 +352,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) case BUG_UD1_UBSAN: if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", - report_ubsan_failure(regs, ud_imm), + report_ubsan_failure(ud_imm), (void *)regs->ip); } break; @@ -1022,24 +1022,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline unsigned long debug_read_clear_dr6(void) +static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; + get_debugreg(dr6, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + /* * The Intel SDM says: * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Certain debug exceptions may clear bits 0-3 of DR6. + * + * BLD induced #DB clears DR6.BLD and any other debug + * exception doesn't modify DR6.BLD. * - * Keep it simple: clear DR6 immediately. + * RTM induced #DB clears DR6.RTM and any other debug + * exception sets DR6.RTM. + * + * To avoid confusion in identifying debug exceptions, + * debug handlers should set DR6.BLD and DR6.RTM, and + * clear other DR6 bits before returning. + * + * Keep it simple: write DR6 with its architectural reset + * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. 
*/ - get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); - dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } @@ -1239,13 +1247,13 @@ out: /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - exc_debug_kernel(regs, debug_read_clear_dr6()); + exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - exc_debug_user(regs, debug_read_clear_dr6()); + exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED @@ -1264,7 +1272,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which - * debug_read_clear_dr6() returns for the IDT entry points. + * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); @@ -1279,7 +1287,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6 = debug_read_clear_dr6(); + unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fe8ea8c097de..2c86673155c9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -95,6 +95,8 @@ config KVM_SW_PROTECTED_VM config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL + select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST + select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -129,6 +131,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) @@ -154,6 +166,16 @@ config KVM_AMD_SEV Encrypted State (SEV-ES), and Secure Encrypted Virtualization with Secure Nested Paging (SEV-SNP) technologies on AMD processors. +config KVM_IOAPIC + bool "I/O APIC, PIC, and PIT emulation" + default y + depends on KVM + help + Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. + for full in-kernel APIC emulation. + + If unsure, say Y. 
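A small, hedged sketch of the compile-time split that the new KVM_IOAPIC option creates, modeled on the kvm_cpu_has_extint() rework further down in this diff: only builds with the full in-kernel irqchip consult the emulated PIC, while the split-irqchip path (vector queued by userspace) is always available. The struct and helper below are simplified stand-ins, not KVM's real types.

#include <stdbool.h>
#include <stdio.h>

struct kvm_stub {
	bool pic_in_kernel;		/* full irqchip, CONFIG_KVM_IOAPIC builds only */
	bool pic_output;		/* ExtINT asserted by the emulated PIC */
	int  pending_external_vector;	/* split irqchip: vector queued by userspace, -1 if none */
};

static bool has_extint(const struct kvm_stub *kvm)
{
#ifdef CONFIG_KVM_IOAPIC
	if (kvm->pic_in_kernel)
		return kvm->pic_output;
#endif
	return kvm->pending_external_vector != -1;
}

int main(void)
{
	struct kvm_stub kvm = { .pending_external_vector = -1 };

	printf("extint pending: %d\n", has_extint(&kvm));
	return 0;
}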
+ config KVM_SMM bool "System Management Mode emulation" default y diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..c4b8950c7abe 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror include $(srctree)/virt/kvm/Makefile.kvm -kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o +kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \ + debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o +kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o kvm-$(CONFIG_KVM_HYPERV) += hyperv.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o @@ -20,6 +19,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ecd85f4801cc..e2836a255b16 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -81,17 +81,8 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } -/* - * Magic value used by KVM when querying userspace-provided CPUID entries and - * doesn't care about the CPIUD index because the index of the function in - * question is not significant. Note, this magic value must have at least one - * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() - * to avoid false positives when processing guest CPUID input. - */ -#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull - -static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu, - u32 function, u64 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -108,8 +99,8 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu, */ lockdep_assert_irqs_enabled(); - for (i = 0; i < vcpu->arch.cpuid_nent; i++) { - e = &vcpu->arch.cpuid_entries[i]; + for (i = 0; i < nent; i++) { + e = &entries[i]; if (e->function != function) continue; @@ -140,26 +131,7 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu, return NULL; } - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - return cpuid_entry2_find(vcpu, function, index); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function) -{ - return cpuid_entry2_find(vcpu, function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - -/* - * cpuid_entry2_find() and KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used - * directly outside of kvm_find_cpuid_entry() and kvm_find_cpuid_entry_index(). 
- */ -#undef KVM_CPUID_INDEX_NOT_SIGNIFICANT +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { @@ -492,6 +464,20 @@ not_found: return 36; } +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008); + if (best) + return (best->eax >> 16) & 0xff; +not_found: + return 0; +} + /* * This "raw" version returns the reserved GPA bits without any adjustments for * encryption technologies that usurp bits. The raw mask should be used if and @@ -992,6 +978,8 @@ void kvm_set_cpu_caps(void) F(FZRM), F(FSRS), F(FSRC), + F(WRMSRNS), + X86_64_F(LKGS), F(AMX_FP16), F(AVX_IFMA), F(LAM), @@ -1107,6 +1095,7 @@ void kvm_set_cpu_caps(void) F(AMD_SSB_NO), F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), + F(AMD_IBRS_SAME_MODE), F(AMD_PSFD), F(AMD_IBPB_RET), ); @@ -1164,6 +1153,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_init(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP), + F(WRMSR_XX_BASE_NS), /* * Synthesize "LFENCE is serializing" into the AMD-defined entry * in KVM's supported CPUID, i.e. if the feature is reported as @@ -1176,17 +1166,27 @@ void kvm_set_cpu_caps(void) */ SYNTHESIZED_F(LFENCE_RDTSC), /* SmmPgCfgLock */ + /* 4: Resv */ + SYNTHESIZED_F(VERW_CLEAR), F(NULL_SEL_CLR_BASE), + /* UpperAddressIgnore */ F(AUTOIBRS), + F(PREFETCHI), EMULATED_F(NO_SMM_CTL_MSR), /* PrefetchCtlMsr */ - F(WRMSR_XX_BASE_NS), + /* GpOnUserCpuid */ + /* EPSF */ SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), F(SRSO_USER_KERNEL_NO), ); + kvm_cpu_cap_init(CPUID_8000_0021_ECX, + SYNTHESIZED_F(TSA_SQ_NO), + SYNTHESIZED_F(TSA_L1_NO), + ); + kvm_cpu_cap_init(CPUID_8000_0022_EAX, F(PERFMON_V2), ); @@ -1756,8 +1756,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; case 0x80000021: - entry->ebx = entry->ecx = entry->edx = 0; + entry->ebx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); + cpuid_entry_override(entry, CPUID_8000_0021_ECX); break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d2884162a46a..d3f5ae15a7ca 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -11,10 +11,34 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, + int nent, u32 function, u64 index); +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2() + * to avoid false positives when processing guest CPUID input. + * + * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of + * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry(). 
+ */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); +} + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} + int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -34,6 +58,7 @@ void __init kvm_init_xstate_sizes(void); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 24f0318c50d7..72b19a88a776 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) +int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vpidx); + if (!level) + return -1; + + synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; - return synic_set_irq(synic, sint); + return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) @@ -1979,6 +1983,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) goto out_flush_all; + if (is_noncanonical_invlpg_address(entries[i], vcpu)) + continue; + /* * Lower 12 bits of 'address' encode the number of additional * pages to flush. 
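A rough decode sketch for the flush-entry layout referenced in the Hyper-V TLB-flush comment above: the page-aligned address occupies the upper bits, while the low 12 bits carry the number of *additional* pages to flush. The constant name and the example value are illustrative assumptions, not taken from the hypercall ABI headers.

#include <stdint.h>
#include <stdio.h>

#define HV_FLUSH_COUNT_MASK 0xfffULL	/* low 12 bits: additional pages */

static void decode_flush_entry(uint64_t entry)
{
	uint64_t gva   = entry & ~HV_FLUSH_COUNT_MASK;
	uint64_t count = (entry & HV_FLUSH_COUNT_MASK) + 1;	/* first page plus the extras */

	printf("flush %llu page(s) starting at %#llx\n",
	       (unsigned long long)count, (unsigned long long)gva);
}

int main(void)
{
	decode_flush_entry(0x7f1234567000ULL | 3);	/* the first page plus 3 more */
	return 0;
}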
@@ -2001,11 +2008,11 @@ out_flush_all: static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + unsigned long *vcpu_mask = hv_vcpu->vcpu_mask; u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; /* * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 913bfc96959c..6ce160ffa678 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); +int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 739aa6c0d0c3..d1b79b418c05 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work) if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) return; - kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false); + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. @@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) atomic_set(&pit->pit_state.irq_ack, 1); } -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) +static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) { struct kvm_kpit_state *ps = &pit->pit_state; struct kvm *kvm = pit->kvm; @@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) } } -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start) +static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start) { u8 saved_mode; @@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) kvm_pit_reset_reinject(pit); } +int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; + + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); + + mutex_lock(&kps->lock); + memcpy(ps, &kps->channels, sizeof(*ps)); + mutex_unlock(&kps->lock); + return 0; +} + +int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + int i; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); + for (i = 0; i < 3; i++) + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); + mutex_unlock(&pit->pit_state.lock); + return 0; +} + +int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); + return 0; +} + +int 
kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int start = 0; + int i; + u32 prev_legacy, cur_legacy; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&pit->pit_state.channels, &ps->channels, + sizeof(pit->pit_state.channels)); + pit->pit_state.flags = ps->flags; + for (i = 0; i < 3; i++) + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, + start && i == 0); + mutex_unlock(&pit->pit_state.lock); + return 0; +} + +int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) +{ + struct kvm_pit *pit = kvm->arch.vpit; + + /* pit->pit_state.lock was overloaded to prevent userspace from getting + * an inconsistent state after running multiple KVM_REINJECT_CONTROL + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. + */ + mutex_lock(&pit->pit_state.lock); + kvm_pit_set_reinject(pit, control->pit_reinject); + mutex_unlock(&pit->pit_state.lock); + + return 0; +} + static const struct kvm_io_device_ops pit_dev_ops = { .read = pit_ioport_read, .write = pit_ioport_write, @@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) if (!pit) return NULL; - pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) - goto fail_request; - mutex_init(&pit->pit_state.lock); pid = get_pid(task_tgid(current)); @@ -726,8 +795,6 @@ fail_register_pit: kvm_pit_set_reinject(pit, false); kthread_destroy_worker(pit->worker); fail_kthread: - kvm_free_irq_source_id(kvm, pit->irq_source_id); -fail_request: kfree(pit); return NULL; } @@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm) kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); - kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index a768212ba821..60fa499d2f8a 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -6,6 +6,11 @@ #include <kvm/iodev.h> +#include <uapi/asm/kvm.h> + +#include "ioapic.h" + +#ifdef CONFIG_KVM_IOAPIC struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -42,7 +47,6 @@ struct kvm_pit { struct kvm_io_device speaker_dev; struct kvm *kvm; struct kvm_kpit_state pit_state; - int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; struct kthread_worker *worker; struct kthread_work expired; @@ -55,11 +59,14 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 +int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps); +int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps); +int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control); + struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); - -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start); -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); +#endif /* CONFIG_KVM_IOAPIC */ #endif diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index a8fb19940975..2ac7f1678c46 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -31,6 +31,8 @@ #include <linux/mm.h> #include 
<linux/slab.h> #include <linux/bitops.h> + +#include "ioapic.h" #include "irq.h" #include <linux/kvm_host.h> @@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) +int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { + struct kvm_pic *s = kvm->arch.vpic; + int irq = e->irqchip.pin; int ret, irq_level; BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); @@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) return ret; } -void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) -{ - int i; - - pic_lock(s); - for (i = 0; i < PIC_NUM_PINS; i++) - __clear_bit(irq_source_id, &s->irq_states[i]); - pic_unlock(s); -} - /* * acknowledge interrupt 'irq' */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 995eb5054360..2b5d389bca5f 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -41,11 +41,11 @@ #include <asm/processor.h> #include <asm/page.h> #include <asm/current.h> -#include <trace/events/kvm.h> #include "ioapic.h" #include "lapic.h" #include "irq.h" +#include "trace.h" static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); @@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - e->fields.dest_id, dm) || - kvm_apic_pending_eoi(vcpu, e->fields.vector)) - __set_bit(e->fields.vector, - ioapic_handled_vectors); + kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm, + e->fields.vector, ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); @@ -313,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + mutex_lock(&kvm->irq_lock); + kimn->irq = irq; + hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_srcu(&kvm->irq_srcu); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + struct kvm_irq_mask_notifier *kimn; + int idx, gsi; + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) + hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link) + if (kimn->irq == gsi) + kimn->func(kimn, mask); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; @@ -482,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) return ret; } -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, - int level, bool line_status) +int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + int irq = e->irqchip.pin; int ret, irq_level; BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); @@ -499,16 +534,6 @@ 
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, return ret; } -void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) -{ - int i; - - spin_lock(&ioapic->lock); - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) - __clear_bit(irq_source_id, &ioapic->irq_states[i]); - spin_unlock(&ioapic->lock); -} - static void kvm_ioapic_eoi_inject_work(struct work_struct *work) { int i; @@ -721,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm) return -ENOMEM; spin_lock_init(&ioapic->lock); INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); + INIT_HLIST_HEAD(&ioapic->mask_notifier_list); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 539333ac4b38..bf28dbc11ff6 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -86,8 +86,24 @@ struct kvm_ioapic { struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; u32 irr_delivered; + + /* reads protected by irq_srcu, writes by irq_lock */ + struct hlist_head mask_notifier_list; +}; + +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; }; +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask); + #ifdef DEBUG #define ASSERT(x) \ do { \ @@ -103,7 +119,7 @@ do { \ static inline int ioapic_in_kernel(struct kvm *kvm) { - return irqchip_kernel(kvm); + return irqchip_full(kvm); } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -111,13 +127,15 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, - int level, bool line_status); -void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); +int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status); + void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 63f66c51975a..16da89259011 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -11,9 +11,12 @@ #include <linux/export.h> #include <linux/kvm_host.h> +#include <linux/kvm_irqfd.h> +#include "hyperv.h" +#include "ioapic.h" #include "irq.h" -#include "i8254.h" +#include "trace.h" #include "x86.h" #include "xen.h" @@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v) return v->arch.pending_external_vector != -1; } +static int get_userspace_extint(struct kvm_vcpu *vcpu) +{ + int vector = vcpu->arch.pending_external_vector; + + vcpu->arch.pending_external_vector = -1; + return vector; +} + /* * check if there is pending interrupt from * non-APIC source without intack. 
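A stand-alone sketch of the consume-once pattern that the new get_userspace_extint() helper above factors out: a pending vector of -1 means nothing is queued, and fetching the vector resets it so it is delivered at most once. The struct and main() driver are simplified stand-ins for the vCPU state.

#include <stdio.h>

struct vcpu_stub {
	int pending_external_vector;	/* -1 means nothing queued */
};

static int has_userspace_extint(const struct vcpu_stub *v)
{
	return v->pending_external_vector != -1;
}

static int get_userspace_extint(struct vcpu_stub *v)
{
	int vector = v->pending_external_vector;

	v->pending_external_vector = -1;	/* consume: deliver at most once */
	return vector;
}

int main(void)
{
	struct vcpu_stub v = { .pending_external_vector = 0x20 };

	if (has_userspace_extint(&v))
		printf("delivering vector %#x\n", get_userspace_extint(&v));
	printf("still pending? %d\n", has_userspace_extint(&v));
	return 0;
}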
@@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!kvm_apic_accept_pic_intr(v)) return 0; - if (irqchip_split(v->kvm)) - return pending_userspace_extint(v); - else +#ifdef CONFIG_KVM_IOAPIC + if (pic_in_kernel(v->kvm)) return v->kvm->arch.vpic->output; +#endif + + WARN_ON_ONCE(!irqchip_split(v->kvm)); + return pending_userspace_extint(v); } /* @@ -100,6 +114,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; + if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected) + return kvm_x86_call(protected_apic_has_interrupt)(v); + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); @@ -123,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v) return v->kvm->arch.xen.upcall_vector; #endif - if (irqchip_split(v->kvm)) { - int vector = v->arch.pending_external_vector; - - v->arch.pending_external_vector = -1; - return vector; - } else +#ifdef CONFIG_KVM_IOAPIC + if (pic_in_kernel(v->kvm)) return kvm_pic_read_irq(v->kvm); /* PIC */ +#endif + + WARN_ON_ONCE(!irqchip_split(v->kvm)); + return get_userspace_extint(v); } EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); @@ -160,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); +#ifdef CONFIG_KVM_IOAPIC __kvm_migrate_pit_timer(vcpu); +#endif kvm_x86_call(migrate_timers)(vcpu); } @@ -168,10 +187,532 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) { bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE; - return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm); + return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm); } bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) { return irqchip_in_kernel(kvm); } + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, struct dest_map *dest_map) +{ + int r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { + pr_info("apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_lowest_prio_delivery(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { + if (!kvm_vector_hashing_enabled()) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } + } + } + + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + +static void kvm_msi_to_lapic_irq(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) +{ + struct msi_msg msg = { .address_lo = e->msi.address_lo, + .address_hi = e->msi.address_hi, + .data = e->msi.data }; + + trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? 
+ (u64)msg.address_hi << 32 : 0), msg.data); + + irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); + irq->vector = msg.arch_data.vector; + irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); + irq->trig_mode = msg.arch_data.is_level; + irq->delivery_mode = msg.arch_data.delivery_mode << 8; + irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; + irq->level = 1; + irq->shorthand = APIC_DEST_NOSHORT; +} + +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct kvm_lapic_irq irq; + + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + if (!level) + return -1; + + kvm_msi_to_lapic_irq(kvm, e, &irq); + + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + struct kvm_lapic_irq irq; + int r; + + switch (e->type) { +#ifdef CONFIG_KVM_HYPERV + case KVM_IRQ_ROUTING_HV_SINT: + return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level, + line_status); +#endif + + case KVM_IRQ_ROUTING_MSI: + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_msi_to_lapic_irq(kvm, e, &irq); + + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) + return r; + break; + +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + if (!level) + return -1; + + return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); +#endif + default: + break; + } + + return -EWOULDBLOCK; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); + return 0; +} + +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + /* We can't check irqchip_in_kernel() here as some callers are + * currently initializing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. 
+ */ + switch (ue->type) { +#ifdef CONFIG_KVM_IOAPIC + case KVM_IRQ_ROUTING_IRQCHIP: + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_SLAVE: + e->irqchip.pin += PIC_NUM_PINS / 2; + fallthrough; + case KVM_IRQCHIP_PIC_MASTER: + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; + e->set = kvm_pic_set_irq; + break; + case KVM_IRQCHIP_IOAPIC: + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + return -EINVAL; + e->set = kvm_ioapic_set_irq; + break; + default: + return -EINVAL; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + break; +#endif + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + break; +#ifdef CONFIG_KVM_HYPERV + case KVM_IRQ_ROUTING_HV_SINT: + e->set = kvm_hv_synic_set_irq; + e->hv_sint.vcpu = ue->u.hv_sint.vcpu; + e->hv_sint.sint = ue->u.hv_sint.sint; + break; +#endif +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + return kvm_xen_setup_evtchn(kvm, e, ue); +#endif + default: + return -EINVAL; + } + + return 0; +} + +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int r = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors) +{ + /* + * Intercept EOI if the vCPU is the target of the new IRQ routing, or + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU + * may receive a level-triggered IRQ in the future, or already received + * level-triggered IRQ. The EOI needs to be intercepted and forwarded + * to I/O APIC emulation so that the IRQ can be de-asserted. + */ + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { + __set_bit(vector, ioapic_handled_vectors); + } else if (kvm_apic_pending_eoi(vcpu, vector)) { + __set_bit(vector, ioapic_handled_vectors); + + /* + * Track the highest pending EOI for which the vCPU is NOT the + * target in the new routing. Only the EOI for the IRQ that is + * in-flight (for the old routing) needs to be intercepted, any + * future IRQs that arrive on this vCPU will be coincidental to + * the level-triggered routing and don't need to be intercepted. 
+ */ + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; + } +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + struct kvm_lapic_irq irq; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + + kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq); + + if (!irq.trig_mode) + continue; + + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, + irq.vector, ioapic_handled_vectors); + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_HYPERV + kvm_hv_irq_routing_update(kvm); +#endif + + if (irqchip_split(kvm)) + kvm_make_scan_ioapic_request(kvm); +} + +static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *entry) +{ + unsigned int host_irq = irqfd->producer->irq; + struct kvm *kvm = irqfd->kvm; + struct kvm_vcpu *vcpu = NULL; + struct kvm_lapic_irq irq; + int r; + + if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass())) + return -EINVAL; + + if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { + kvm_msi_to_lapic_irq(kvm, entry, &irq); + + /* + * Force remapped mode if hardware doesn't support posting the + * virtual interrupt to a vCPU. Only IRQs are postable (NMIs, + * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support + * posting multicast/broadcast IRQs. If the interrupt can't be + * posted, the device MSI needs to be routed to the host so that + * the guest's desired interrupt can be synthesized by KVM. + * + * This means that KVM can only post lowest-priority interrupts + * if they have a single CPU as the destination, e.g. only if + * the guest has affined the interrupt to a single vCPU. 
+ */ + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) + vcpu = NULL; + } + + if (!irqfd->irq_bypass_vcpu && !vcpu) + return 0; + + r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi, + vcpu, irq.vector); + if (r) { + WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu); + irqfd->irq_bypass_vcpu = NULL; + return r; + } + + irqfd->irq_bypass_vcpu = vcpu; + + trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu); + return 0; +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + int ret = 0; + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + + if (!kvm->arch.nr_possible_bypass_irqs++) + kvm_x86_call(pi_start_bypass)(kvm); + + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry); + if (ret) + kvm->arch.nr_possible_bypass_irqs--; + } + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + int ret; + + WARN_ON(irqfd->producer != prod); + + /* + * If the producer of an IRQ that is currently being posted to a vCPU + * is unregistered, change the associated IRTE back to remapped mode as + * the IRQ has been released (or repurposed) by the device driver, i.e. + * KVM must relinquish control of the IRTE. + */ + spin_lock_irq(&kvm->irqfds.lock); + + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_pi_update_irte(irqfd, NULL); + if (ret) + pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", + irqfd->consumer.eventfd, ret); + } + irqfd->producer = NULL; + + kvm->arch.nr_possible_bypass_irqs--; + + spin_unlock_irq(&kvm->irqfds.lock); +} + +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (new->type != KVM_IRQ_ROUTING_MSI && + old->type != KVM_IRQ_ROUTING_MSI) + return; + + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; + + kvm_pi_update_irte(irqfd, new); +} + +#ifdef CONFIG_KVM_IOAPIC +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } +#define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +}; + +int 
kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} + +int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + struct kvm_pic *pic = kvm->arch.vpic; + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, &pic->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, &pic->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + kvm_get_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = -EINVAL; + break; + } + return r; +} + +int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + struct kvm_pic *pic = kvm->arch.vpic; + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, + sizeof(struct kvm_pic_state)); + spin_unlock(&pic->lock); + break; + case KVM_IRQCHIP_PIC_SLAVE: + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, + sizeof(struct kvm_pic_state)); + spin_unlock(&pic->lock); + break; + case KVM_IRQCHIP_IOAPIC: + kvm_set_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = -EINVAL; + break; + } + kvm_pic_update_irq(pic); + return r; +} +#endif diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 76d46b2f41dd..5e62c1f79ce6 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -18,6 +18,8 @@ #include <kvm/iodev.h> #include "lapic.h" +#ifdef CONFIG_KVM_IOAPIC + #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) @@ -63,17 +65,15 @@ int kvm_pic_init(struct kvm *kvm); void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status); -static inline int irqchip_split(struct kvm *kvm) -{ - int mode = kvm->arch.irqchip_mode; +int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm); - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); - return mode == KVM_IRQCHIP_SPLIT; -} +int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); +int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); -static inline int irqchip_kernel(struct kvm *kvm) +static inline int irqchip_full(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; @@ -81,10 +81,26 @@ static inline int irqchip_kernel(struct kvm *kvm) smp_rmb(); return mode == KVM_IRQCHIP_KERNEL; } +#else /* CONFIG_KVM_IOAPIC */ +static __always_inline int irqchip_full(struct kvm *kvm) +{ + return false; +} +#endif static inline int pic_in_kernel(struct kvm *kvm) { - return irqchip_kernel(kvm); + return irqchip_full(kvm); +} + + +static inline int irqchip_split(struct kvm *kvm) +{ + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_in_kernel(struct kvm *kvm) @@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); -int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c deleted file mode 100644 index 8136695f7b96..000000000000 --- 
a/arch/x86/kvm/irq_comm.c +++ /dev/null @@ -1,442 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * irq_comm.c: Common API for in kernel interrupt controller - * Copyright (c) 2007, Intel Corporation. - * - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/rculist.h> - -#include <trace/events/kvm.h> - -#include "irq.h" - -#include "ioapic.h" - -#include "lapic.h" - -#include "hyperv.h" -#include "x86.h" -#include "xen.h" - -static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - struct kvm_pic *pic = kvm->arch.vpic; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); -} - -static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, - line_status); -} - -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, struct dest_map *dest_map) -{ - int r = -1; - struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; - unsigned int dest_vcpus = 0; - - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - - if (irq->dest_mode == APIC_DEST_PHYSICAL && - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - pr_info("apic: phys broadcast and lowest prio\n"); - irq->delivery_mode = APIC_DM_FIXED; - } - - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (!kvm_lowest_prio_delivery(irq)) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { - if (!kvm_vector_hashing_enabled()) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; - } else { - __set_bit(i, dest_vcpu_bitmap); - dest_vcpus++; - } - } - } - - if (dest_vcpus != 0) { - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); - - lowest = kvm_get_vcpu(kvm, idx); - } - - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); - - return r; -} - -void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) -{ - struct msi_msg msg = { .address_lo = e->msi.address_lo, - .address_hi = e->msi.address_hi, - .data = e->msi.data }; - - trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? 
- (u64)msg.address_hi << 32 : 0), msg.data); - - irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); - irq->vector = msg.arch_data.vector; - irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); - irq->trig_mode = msg.arch_data.is_level; - irq->delivery_mode = msg.arch_data.delivery_mode << 8; - irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; - irq->level = 1; - irq->shorthand = APIC_DEST_NOSHORT; -} -EXPORT_SYMBOL_GPL(kvm_set_msi_irq); - -static inline bool kvm_msi_route_invalid(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e) -{ - return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); -} - -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, bool line_status) -{ - struct kvm_lapic_irq irq; - - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; - - if (!level) - return -1; - - kvm_set_msi_irq(kvm, e, &irq); - - return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); -} - -#ifdef CONFIG_KVM_HYPERV -static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - if (!level) - return -1; - - return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); -} -#endif - -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - struct kvm_lapic_irq irq; - int r; - - switch (e->type) { -#ifdef CONFIG_KVM_HYPERV - case KVM_IRQ_ROUTING_HV_SINT: - return kvm_hv_set_sint(e, kvm, irq_source_id, level, - line_status); -#endif - - case KVM_IRQ_ROUTING_MSI: - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) - return r; - break; - -#ifdef CONFIG_KVM_XEN - case KVM_IRQ_ROUTING_XEN_EVTCHN: - if (!level) - return -1; - - return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); -#endif - default: - break; - } - - return -EWOULDBLOCK; -} - -int kvm_request_irq_source_id(struct kvm *kvm) -{ - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id; - - mutex_lock(&kvm->irq_lock); - irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); - - if (irq_source_id >= BITS_PER_LONG) { - pr_warn("exhausted allocatable IRQ sources!\n"); - irq_source_id = -EFAULT; - goto unlock; - } - - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - set_bit(irq_source_id, bitmap); -unlock: - mutex_unlock(&kvm->irq_lock); - - return irq_source_id; -} - -void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) -{ - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - - mutex_lock(&kvm->irq_lock); - if (irq_source_id < 0 || - irq_source_id >= BITS_PER_LONG) { - pr_err("IRQ source ID out of range!\n"); - goto unlock; - } - clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_kernel(kvm)) - goto unlock; - - kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); -unlock: - mutex_unlock(&kvm->irq_lock); -} - -void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn) -{ - mutex_lock(&kvm->irq_lock); - kimn->irq = irq; - hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list); - mutex_unlock(&kvm->irq_lock); -} - -void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, - struct 
kvm_irq_mask_notifier *kimn) -{ - mutex_lock(&kvm->irq_lock); - hlist_del_rcu(&kimn->link); - mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); -} - -void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, - bool mask) -{ - struct kvm_irq_mask_notifier *kimn; - int idx, gsi; - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); - if (gsi != -1) - hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) - if (kimn->irq == gsi) - kimn->func(kimn, mask); - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -bool kvm_arch_can_set_irq_routing(struct kvm *kvm) -{ - return irqchip_in_kernel(kvm); -} - -int kvm_set_routing_entry(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) -{ - /* We can't check irqchip_in_kernel() here as some callers are - * currently initializing the irqchip. Other callers should therefore - * check kvm_arch_can_set_irq_routing() before calling this function. - */ - switch (ue->type) { - case KVM_IRQ_ROUTING_IRQCHIP: - if (irqchip_split(kvm)) - return -EINVAL; - e->irqchip.pin = ue->u.irqchip.pin; - switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_SLAVE: - e->irqchip.pin += PIC_NUM_PINS / 2; - fallthrough; - case KVM_IRQCHIP_PIC_MASTER: - if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) - return -EINVAL; - e->set = kvm_set_pic_irq; - break; - case KVM_IRQCHIP_IOAPIC: - if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) - return -EINVAL; - e->set = kvm_set_ioapic_irq; - break; - default: - return -EINVAL; - } - e->irqchip.irqchip = ue->u.irqchip.irqchip; - break; - case KVM_IRQ_ROUTING_MSI: - e->set = kvm_set_msi; - e->msi.address_lo = ue->u.msi.address_lo; - e->msi.address_hi = ue->u.msi.address_hi; - e->msi.data = ue->u.msi.data; - - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; - break; -#ifdef CONFIG_KVM_HYPERV - case KVM_IRQ_ROUTING_HV_SINT: - e->set = kvm_hv_set_sint; - e->hv_sint.vcpu = ue->u.hv_sint.vcpu; - e->hv_sint.sint = ue->u.hv_sint.sint; - break; -#endif -#ifdef CONFIG_KVM_XEN - case KVM_IRQ_ROUTING_XEN_EVTCHN: - return kvm_xen_setup_evtchn(kvm, e, ue); -#endif - default: - return -EINVAL; - } - - return 0; -} - -bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu) -{ - int r = 0; - unsigned long i; - struct kvm_vcpu *vcpu; - - if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) - return true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (++r == 2) - return false; - - *dest_vcpu = vcpu; - } - - return r == 1; -} -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); - -#define IOAPIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ - .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } -#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) - -#define PIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ - .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } -#define ROUTING_ENTRY2(irq) \ - IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) - -static const struct kvm_irq_routing_entry default_routing[] = { - ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), - ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), - ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), - ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), - ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), - ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), - ROUTING_ENTRY2(12), 
ROUTING_ENTRY2(13), - ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), - ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), - ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), - ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), - ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), -}; - -int kvm_setup_default_irq_routing(struct kvm *kvm) -{ - return kvm_set_irq_routing(kvm, default_routing, - ARRAY_SIZE(default_routing), 0); -} - -void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ - if (!irqchip_split(kvm)) - return; - kvm_make_scan_ioapic_request(kvm); -} - -void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, - ulong *ioapic_handled_vectors) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_kernel_irq_routing_entry *entry; - struct kvm_irq_routing_table *table; - u32 i, nr_ioapic_pins; - int idx; - - idx = srcu_read_lock(&kvm->irq_srcu); - table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - nr_ioapic_pins = min_t(u32, table->nr_rt_entries, - kvm->arch.nr_reserved_ioapic_pins); - for (i = 0; i < nr_ioapic_pins; ++i) { - hlist_for_each_entry(entry, &table->map[i], link) { - struct kvm_lapic_irq irq; - - if (entry->type != KVM_IRQ_ROUTING_MSI) - continue; - - kvm_set_msi_irq(vcpu->kvm, entry, &irq); - - if (irq.trig_mode && - (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode) || - kvm_apic_pending_eoi(vcpu, irq.vector))) - __set_bit(irq.vector, ioapic_handled_vectors); - } - } - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -void kvm_arch_irq_routing_update(struct kvm *kvm) -{ -#ifdef CONFIG_KVM_HYPERV - kvm_hv_irq_routing_update(kvm); -#endif -} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 28e3317124fd..8172c2042dd6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/math64.h> #include <linux/slab.h> +#include <asm/apic.h> #include <asm/processor.h> #include <asm/mce.h> #include <asm/msr.h> @@ -55,9 +56,6 @@ /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) -/* followed define is not in apicdef.h */ -#define MAX_APIC_VECTOR 256 -#define APIC_VECTORS_PER_REG 32 /* * Enable local APIC timer advancement (tscdeadline mode only) with adaptive @@ -79,42 +77,20 @@ module_param(lapic_timer_advance, bool, 0444); static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); -static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) -{ - *((u32 *) (regs + reg_off)) = val; -} - static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - __kvm_lapic_set_reg(apic->regs, reg_off, val); -} - -static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg) -{ - BUILD_BUG_ON(reg != APIC_ICR); - return *((u64 *) (regs + reg)); + apic_set_reg(apic->regs, reg_off, val); } static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg) { - return __kvm_lapic_get_reg64(apic->regs, reg); -} - -static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val) -{ - BUILD_BUG_ON(reg != APIC_ICR); - *((u64 *) (regs + reg)) = val; + return apic_get_reg64(apic->regs, reg); } static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic, int reg, u64 val) { - __kvm_lapic_set_reg64(apic->regs, reg, val); -} - -static inline int apic_test_vector(int vec, void *bitmap) -{ - return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); + apic_set_reg64(apic->regs, reg, val); } bool kvm_apic_pending_eoi(struct kvm_vcpu 
*vcpu, int vector) @@ -125,16 +101,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } -static inline int __apic_test_and_set_vector(int vec, void *bitmap) -{ - return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int __apic_test_and_clear_vector(int vec, void *bitmap) -{ - return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); @@ -626,21 +592,6 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; -static int find_highest_vector(void *bitmap) -{ - int vec; - u32 *reg; - - for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; - vec >= 0; vec -= APIC_VECTORS_PER_REG) { - reg = bitmap + REG_POS(vec); - if (*reg) - return __fls(*reg) + vec; - } - - return -1; -} - static u8 count_vectors(void *bitmap) { int vec; @@ -648,34 +599,36 @@ static u8 count_vectors(void *bitmap) u8 count = 0; for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { - reg = bitmap + REG_POS(vec); + reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec); count += hweight32(*reg); } return count; } -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { + unsigned long pir_vals[NR_PIR_WORDS]; + u32 *__pir = (void *)pir_vals; u32 i, vec; - u32 pir_val, irr_val, prev_irr_val; + u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; + if (!pi_harvest_pir(pir, pir_vals)) + return false; + for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); - irr_val = *p_irr; - pir_val = READ_ONCE(pir[i]); - - if (pir_val) { - pir_val = xchg(&pir[i], 0); + irr_val = READ_ONCE(*p_irr); + if (__pir[i]) { prev_irr_val = irr_val; do { - irr_val = prev_irr_val | pir_val; + irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); @@ -691,7 +644,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); @@ -704,7 +657,7 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { - return find_highest_vector(apic->regs + APIC_IRR); + return apic_find_highest_vector(apic->regs + APIC_IRR); } static inline int apic_find_highest_irr(struct kvm_lapic *apic) @@ -727,10 +680,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { if (unlikely(apic->apicv_active)) { - kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); + apic_clear_vector(vec, apic->regs + APIC_IRR); } else { apic->irr_pending = false; - kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); + apic_clear_vector(vec, apic->regs + APIC_IRR); if (apic_search_irr(apic) != -1) apic->irr_pending = true; } @@ -742,9 +695,15 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) } EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); +static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic) +{ + return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec); +} + static inline 
void apic_set_isr(int vec, struct kvm_lapic *apic) { - if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), + apic_vector_to_isr(vec, apic))) return; /* @@ -779,7 +738,7 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) if (likely(apic->highest_isr_cache != -1)) return apic->highest_isr_cache; - result = find_highest_vector(apic->regs + APIC_ISR); + result = apic_find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); return result; @@ -787,7 +746,8 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), + apic_vector_to_isr(vec, apic))) return; /* @@ -1330,11 +1290,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) - kvm_lapic_set_vector(vector, - apic->regs + APIC_TMR); + apic_set_vector(vector, apic->regs + APIC_TMR); else - kvm_lapic_clear_vector(vector, - apic->regs + APIC_TMR); + apic_clear_vector(vector, apic->regs + APIC_TMR); } kvm_x86_call(deliver_interrupt)(apic, delivery_mode, @@ -1453,12 +1411,20 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - int trigger_mode; + int __maybe_unused trigger_mode; /* Eoi the ioapic only if the ioapic doesn't own the vector. */ if (!kvm_ioapic_handles_vector(apic, vector)) return; + /* + * If the intercepted EOI is for an IRQ that was pending from previous + * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely + * no longer need to be intercepted. + */ + if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector) + kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu); + /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { apic->vcpu->arch.pending_ioapic_eoi = vector; @@ -1466,12 +1432,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) return; } +#ifdef CONFIG_KVM_IOAPIC if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); +#endif } static int apic_set_eoi(struct kvm_lapic *apic) @@ -1790,8 +1758,17 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); + u32 reg; + /* + * Assume a timer IRQ was "injected" if the APIC is protected. KVM's + * copy of the vIRR is bogus, it's the responsibility of the caller to + * precisely check whether or not a timer IRQ is pending. 
+ */ + if (apic->guest_apic_protected) + return true; + + reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; @@ -2650,6 +2627,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) kvm_recalculate_apic_map(vcpu->kvm); return 0; } +EXPORT_SYMBOL_GPL(kvm_apic_set_base); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { @@ -2958,6 +2936,9 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) if (!kvm_apic_present(vcpu)) return -1; + if (apic->guest_apic_protected) + return -1; + __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } @@ -3061,12 +3042,12 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, if (!kvm_x86_ops.x2apic_icr_is_split) { if (set) { - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + icr = apic_get_reg(s->regs, APIC_ICR) | + (u64)apic_get_reg(s->regs, APIC_ICR2) << 32; + apic_set_reg64(s->regs, APIC_ICR, icr); } else { - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + icr = apic_get_reg64(s->regs, APIC_ICR); + apic_set_reg(s->regs, APIC_ICR2, icr >> 32); } } } @@ -3082,8 +3063,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) * Get calculated timer current count for remaining timer period (if * any) and store it in the returned register set. */ - __kvm_lapic_set_reg(s->regs, APIC_TMCCT, - __apic_read(vcpu->arch.apic, APIC_TMCCT)); + apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT)); return kvm_apic_state_fixup(vcpu, s, false); } @@ -3123,8 +3103,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); + +#ifdef CONFIG_KVM_IOAPIC if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); +#endif vcpu->arch.apic_arb_prio = 0; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1a8553ebdb42..72de14527698 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -4,6 +4,8 @@ #include <kvm/iodev.h> +#include <asm/apic.h> + #include <linux/kvm_host.h> #include "hyperv.h" @@ -21,6 +23,8 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + enum lapic_mode { LAPIC_MODE_DISABLED = 0, LAPIC_MODE_INVALID = X2APIC_ENABLE, @@ -65,6 +69,8 @@ struct kvm_lapic { bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; + /* Select registers in the vAPIC cannot be read/written. */ + bool guest_apic_protected; /* Number of bits set in ISR. */ s16 isr_count; /* The highest vector set in ISR; if -1 - invalid, must scan ISR. 
*/ @@ -101,8 +107,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); @@ -143,22 +149,9 @@ void kvm_lapic_exit(void); u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic); -#define VEC_POS(v) ((v) & (32 - 1)) -#define REG_POS(v) (((v) >> 5) << 4) - -static inline void kvm_lapic_clear_vector(int vec, void *bitmap) -{ - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline void kvm_lapic_set_vector(int vec, void *bitmap) -{ - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) { - kvm_lapic_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); /* * irr_pending must be true if any interrupt is pending; set it after * APIC_IRR to avoid race with apic_clear_irr @@ -166,14 +159,9 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } -static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off) -{ - return *((u32 *) (regs + reg_off)); -} - static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) { - return __kvm_lapic_get_reg(apic->regs, reg_off); + return apic_get_reg(apic->regs, reg_off); } DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f2b36d32ef40..b4b6860ab971 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void) u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); @@ -234,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool kvm_mmu_may_ignore_guest_pat(void); +bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); @@ -256,6 +257,9 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); + static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d1b632e33d2..6e838cb6c9e1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -1456,15 +1457,15 @@ void 
kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * enabled but it chooses between clearing the Dirty bit and Writeable * bit based on the context. */ - if (kvm_x86_ops.cpu_dirty_log_size) + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, @@ -1982,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) return true; } +static __ro_after_init HLIST_HEAD(empty_page_hash); + +static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn) +{ + /* + * Ensure the load of the hash table pointer itself is ordered before + * loads to walk the table. The pointer is set at runtime outside of + * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of + * shadow pages becomes necessary only when KVM needs to shadow L1's + * TDP for an L2 guest. Pairs with the smp_store_release() in + * kvm_mmu_alloc_page_hash(). + */ + struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash); + + lockdep_assert_held(&kvm->mmu_lock); + + if (!page_hash) + return &empty_page_hash; + + return &page_hash[kvm_page_table_hashfn(gfn)]; +} + #define for_each_valid_sp(_kvm, _sp, _list) \ hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ - for_each_valid_sp(_kvm, _sp, \ - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ + for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) @@ -2357,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, struct kvm_mmu_page *sp; bool created = false; + /* + * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as + * mmu_page_hash must be set prior to creating the first shadow root, + * i.e. reaching this point is fully serialized by slots_arch_lock. + */ + BUG_ON(!kvm->arch.mmu_page_hash); sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); @@ -3019,7 +3047,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { - if (prefetch) + if (prefetch && is_last_spte(*sptep, level) && + pfn == spte_to_pfn(*sptep)) return RET_PF_SPURIOUS; /* @@ -3033,7 +3062,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; - } else if (pfn != spte_to_pfn(*sptep)) { + } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) { drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -3880,6 +3909,28 @@ out_unlock: return r; } +static int kvm_mmu_alloc_page_hash(struct kvm *kvm) +{ + struct hlist_head *h; + + if (kvm->arch.mmu_page_hash) + return 0; + + h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT); + if (!h) + return -ENOMEM; + + /* + * Ensure the hash table pointer is set only after all stores to zero + * the memory are retired. Pairs with the smp_load_acquire() in + * kvm_get_mmu_page_hash(). 
Note, mmu_lock must be held for write to + * add (or remove) shadow pages, and so readers are guaranteed to see + * an empty list for their current mmu_lock critical section. + */ + smp_store_release(&kvm->arch.mmu_page_hash, h); + return 0; +} + static int mmu_first_shadow_root_alloc(struct kvm *kvm) { struct kvm_memslots *slots; @@ -3899,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) if (kvm_shadow_root_allocated(kvm)) goto out_unlock; + r = kvm_mmu_alloc_page_hash(kvm); + if (r) + goto out_unlock; + /* - * Check if anything actually needs to be allocated, e.g. all metadata - * will be allocated upfront if TDP is disabled. + * Check if memslot metadata actually needs to be allocated, e.g. all + * metadata will be allocated upfront if TDP is disabled. */ if (kvm_memslots_have_rmaps(kvm) && kvm_page_track_write_tracking_enabled(kvm)) @@ -4835,19 +4890,6 @@ out_unlock: } #endif -bool kvm_mmu_may_ignore_guest_pat(void) -{ - /* - * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM - * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to - * honor the memtype from the guest's PAT so that guest accesses to - * memory that is DMA'd aren't cached against the guest's wishes. As a - * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, - * KVM _always_ ignores guest PAT (when EPT is enabled). - */ - return shadow_memtype_mask; -} - int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 @@ -4858,8 +4900,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, - u8 *level) +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; @@ -4873,6 +4914,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, do { if (signal_pending(current)) return -EINTR; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); @@ -4897,18 +4942,23 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, return -EIO; } } +EXPORT_SYMBOL_GPL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; + u64 direct_bits; u64 end; int r; if (!vcpu->kvm->arch.pre_fault_allowed) return -EOPNOTSUPP; + if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa))) + return -EINVAL; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. @@ -4917,15 +4967,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, if (r) return r; + direct_bits = 0; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; + else + direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. 
*/ - r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r; @@ -5589,12 +5642,19 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + int maxpa; + + if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM) + maxpa = cpuid_query_maxguestphyaddr(vcpu); + else + maxpa = cpuid_maxphyaddr(vcpu); + /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ - if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && maxpa <= 48) return 4; return max_tdp_level; @@ -5913,6 +5973,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -6674,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_tdp_mmu_zap_invalidated_roots(kvm, true); } -void kvm_mmu_init_vm(struct kvm *kvm) +int kvm_mmu_init_vm(struct kvm *kvm) { + int r; + kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - if (tdp_mmu_enabled) + if (tdp_mmu_enabled) { kvm_mmu_init_tdp_mmu(kvm); + } else { + r = kvm_mmu_alloc_page_hash(kvm); + if (r) + return r; + } kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; @@ -6691,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm) kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; + return 0; } static void mmu_free_vm_memory_caches(struct kvm *kvm) @@ -6702,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm) void kvm_mmu_uninit_vm(struct kvm *kvm) { + kvfree(kvm->arch.mmu_page_hash); + if (tdp_mmu_enabled) kvm_mmu_uninit_tdp_mmu(kvm); @@ -7239,6 +7310,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, }; bool flush; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 75f00598289d..65f3c89d7c5d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -103,6 +103,9 @@ struct kvm_mmu_page { int root_count; refcount_t tdp_mmu_root_count; }; + + bool has_mapped_host_mmio; + union { /* These two members aren't used for TDP MMU */ struct { @@ -187,7 +190,8 @@ static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mm return kvm_gfn_direct_bits(kvm); } -static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, + struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -197,7 +201,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). 
*/ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 561c331fd6ec..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm) struct kvm_memory_slot *slot; int r = 0, i, bkt; + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + mutex_lock(&kvm->slots_arch_lock); /* diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 68e323568e95..ed762bb4b007 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r != RET_PF_CONTINUE) return r; +#if PTTYPE != PTTYPE_EPT /* - * Do not change pte_access if the pfn is a mmio page, otherwise - * we will cache the incorrect access into mmio spte. + * Treat the guest PTE protections as writable, supervisor-only if this + * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore + * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO, + * otherwise KVM will cache incorrect access information in the SPTE. */ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { @@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (is_cr4_smep(vcpu->arch.mmu)) walker.pte_access &= ~ACC_EXEC_MASK; } +#endif r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0f9f47b4ab0e..df31039b5d63 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; -u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -96,8 +95,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); - access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; @@ -107,7 +104,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) return spte; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) +static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && @@ -128,6 +125,35 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio) +{ + /* + * Determining if a PFN is host MMIO is relative expensive. Cache the + * result locally (in the sole caller) to avoid doing the full query + * multiple times when creating a single SPTE. 
+ */ + if (*is_host_mmio < 0) + *is_host_mmio = __kvm_is_mmio_pfn(pfn); + + return *is_host_mmio; +} + +static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + + if (root) + WRITE_ONCE(root->has_mapped_host_mmio, true); + else + WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true); + + /* + * Force vCPUs to exit and flush CPU buffers if the vCPU is using the + * affected root(s). + */ + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + /* * Returns true if the SPTE needs to be updated atomically due to having bits * that may be changed without holding mmu_lock, and for which KVM must not @@ -165,6 +191,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, { int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; + int is_host_mmio = -1; bool wrprot = false; /* @@ -177,7 +204,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; - else if (kvm_mmu_page_ad_need_write_protect(sp)) + else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; @@ -212,15 +239,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (shadow_memtype_mask) + if (kvm_x86_ops.get_mt_mask) spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + kvm_is_mmio_pfn(pfn, &is_host_mmio)); if (host_writable) spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; - if (shadow_me_value && !kvm_is_mmio_pfn(pfn)) + if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio)) spte |= shadow_me_value; spte |= (u64)pfn << PAGE_SHIFT; @@ -265,6 +292,11 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } + if (static_branch_unlikely(&cpu_buf_vm_clear) && + !kvm_vcpu_can_access_host_mmio(vcpu) && + kvm_is_mmio_pfn(pfn, &is_host_mmio)) + kvm_track_host_mmio_mapping(vcpu); + *new_spte = spte; return wrprot; } @@ -440,6 +472,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ @@ -463,13 +501,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ shadow_present_mask = (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; - /* - * EPT overrides the host MTRRs, and so KVM must program the desired - * memtype directly into the SPTEs. Note, this mask is just the mask - * of all bits that factor into the memtype, the actual memtype must be - * dynamically calculated, e.g. to ensure host MMIO is mapped UC. - */ - shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -521,12 +553,6 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; - /* - * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB - * memtype in the SPTEs, i.e. 
relies on host MTRRs to provide the - * correct memtype (WB is the "weakest" memtype). - */ - shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 79cdceba9857..3133f066927e 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; -extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -281,6 +280,16 @@ static inline bool is_mirror_sptep(tdp_ptep_t sptep) return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep))); } +static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + + if (root) + return READ_ONCE(root->has_mapped_host_mmio); + + return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio); +} + static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 21a3b8166242..7f3d7229b2c1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -378,7 +378,7 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, /* Zapping leaf spte is allowed only when write lock is held. */ lockdep_assert_held_write(&kvm->mmu_lock); /* Because write lock is held, operation should success. */ - ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn); + ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); KVM_BUG_ON(ret, kvm); } @@ -485,8 +485,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) } if (is_mirror_sp(sp) && - WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level, - sp->external_spt))) { + WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level, + sp->external_spt))) { /* * Failed to free page table page in mirror page table and * there is nothing to do further. @@ -538,12 +538,12 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp * external page table, or leaf. 
*/ if (is_leaf) { - ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn); + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); } else { void *external_spt = get_external_spt(gfn, new_spte, level); KVM_BUG_ON(!external_spt, kvm); - ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt); + ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt); } if (ret) __kvm_tdp_mmu_write_spte(sptep, old_spte); @@ -1153,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; - if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) - return RET_PF_SPURIOUS; - if (is_shadow_present_pte(iter->old_spte) && - is_access_allowed(fault, iter->old_spte) && - is_last_spte(iter->old_spte, iter->level)) + (fault->prefetch || is_access_allowed(fault, iter->old_spte)) && + is_last_spte(iter->old_spte, iter->level)) { + WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte)); return RET_PF_SPURIOUS; + } if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); @@ -1630,21 +1629,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } -static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * All TDP MMU shadow pages share the same role as their root, aside * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. */ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; + return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled; } static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; rcu_read_lock(); @@ -1689,8 +1688,8 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); @@ -1911,16 +1910,13 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
*/ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1929,6 +1925,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index fde0ae986003..c53b92379e6e 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -52,6 +52,10 @@ /* CPUID level 0x80000022 (EAX) */ #define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0) +/* CPUID level 0x80000021 (ECX) */ +#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1) +#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2) + struct cpuid_reg { u32 function; u32 index; @@ -82,6 +86,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, + [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, }; /* @@ -121,6 +126,8 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); + KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO); + KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO); default: return x86_feature; } diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index a1cf2ac5bd78..551703fbe200 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -142,6 +142,9 @@ union kvm_smram { static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { + if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE)) + return -ENOTTY; + kvm_make_request(KVM_REQ_SMI, vcpu); return 0; } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 067f8e3f5a0d..a34c5c3b164e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -18,6 +18,7 @@ #include <linux/hashtable.h> #include <linux/amd-iommu.h> #include <linux/kvm_host.h> +#include <linux/kvm_irqfd.h> #include <asm/irq_remapping.h> #include <asm/msr.h> @@ -29,36 +30,39 @@ #include "svm.h" /* - * Encode the arbitrary 
VM ID and the vCPU's default APIC ID, i.e the vCPU ID, - * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry - * if an interrupt can't be delivered, e.g. because the vCPU isn't running. + * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that + * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't + * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index + * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast + * lookup on the index, where as vCPUs whose index doesn't match their ID need + * to walk the entire xarray of vCPUs in the worst case scenario. * - * For the vCPU ID, use however many bits are currently allowed for the max + * For the vCPU index, use however many bits are currently allowed for the max * guest physical APIC ID (limited by the size of the physical ID table), and * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the * size of the GATag is defined by hardware (32 bits), but is an opaque value * as far as hardware is concerned. */ -#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK +#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) -#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ - ((vcpu_id) & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG(vm_id, vcpu_id) \ +#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ + ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) +#define AVIC_GATAG(vm_id, vcpu_idx) \ ({ \ - u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ + u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ \ - WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ + WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ ga_tag; \ }) -static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); +static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); static bool force_avic; module_param_unsafe(force_avic, bool, 0444); @@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); bool x2avic_enabled; -/* - * This is a wrapper of struct amd_iommu_ir_data. 
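As a purely illustrative aside, the tag packing described above can be reproduced in a standalone program. The 9-bit vCPU-index field below is an assumption made for the demo; in KVM the index field is however many bits AVIC_PHYSICAL_MAX_INDEX_MASK covers, and the VM ID takes whatever remains of the 32-bit tag.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_VCPU_IDX_MASK	0x1ffu		/* assumed 9-bit index field */
#define DEMO_VM_ID_SHIFT	9		/* popcount of the index mask */
#define DEMO_VM_ID_MASK		(0xffffffffu >> DEMO_VM_ID_SHIFT)

static uint32_t demo_gatag(uint32_t vm_id, uint32_t vcpu_idx)
{
	return ((vm_id & DEMO_VM_ID_MASK) << DEMO_VM_ID_SHIFT) |
	       (vcpu_idx & DEMO_VCPU_IDX_MASK);
}

int main(void)
{
	uint32_t tag = demo_gatag(0x1234, 42);

	/* Decoding recovers both fields, mirroring the AVIC_GATAG_TO_*() macros. */
	assert(((tag >> DEMO_VM_ID_SHIFT) & DEMO_VM_ID_MASK) == 0x1234);
	assert((tag & DEMO_VCPU_IDX_MASK) == 42);
	printf("ga_tag = %#x\n", (unsigned)tag);
	return 0;
}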
- */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; - static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag) struct kvm_svm *kvm_svm; struct kvm_vcpu *vcpu = NULL; u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag); - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - trace_kvm_avic_ga_log(vm_id, vcpu_id); + pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx); + trace_kvm_avic_ga_log(vm_id, vcpu_idx); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { if (kvm_svm->avic_vm_id != vm_id) continue; - vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); + vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx); break; } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); @@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm) if (!enable_apicv) return; - if (kvm_svm->avic_logical_id_table_page) - __free_page(kvm_svm->avic_logical_id_table_page); - if (kvm_svm->avic_physical_id_table_page) - __free_page(kvm_svm->avic_physical_id_table_page); + free_page((unsigned long)kvm_svm->avic_logical_id_table); + free_page((unsigned long)kvm_svm->avic_physical_id_table); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm) int err = -ENOMEM; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); struct kvm_svm *k2; - struct page *p_page; - struct page *l_page; u32 vm_id; if (!enable_apicv) return 0; - /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!p_page) + kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_physical_id_table) goto free_avic; - kvm_svm->avic_physical_id_table_page = p_page; - - /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!l_page) + kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_logical_id_table) goto free_avic; - kvm_svm->avic_logical_id_table_page = l_page; - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; @@ -242,17 +228,19 @@ free_avic: return err; } +static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) +{ + return __sme_set(__pa(svm->vcpu.arch.apic->regs)); +} + void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); - phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); - phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); - phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); - vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; - vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); + vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table)); + vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table)); + vmcb->control.avic_vapic_bar = 
APIC_DEFAULT_PHYS_BASE; if (kvm_apicv_activated(svm->vcpu.kvm)) avic_activate_vmcb(svm); @@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) avic_deactivate_vmcb(svm); } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, - unsigned int index) -{ - u64 *avic_physical_id_table; - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - - if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || - (index > X2AVIC_MAX_PHYSICAL_ID)) - return NULL; - - avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); - - return &avic_physical_id_table[index]; -} - static int avic_init_backing_page(struct kvm_vcpu *vcpu) { - u64 *entry, new_entry; - int id = vcpu->vcpu_id; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); + u32 id = vcpu->vcpu_id; + u64 new_entry; + /* + * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC + * hardware. Immediately clear apicv_active, i.e. don't wait until the + * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as + * avic_vcpu_load() expects to be called if and only if the vCPU has + * fully initialized AVIC. + */ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > X2AVIC_MAX_PHYSICAL_ID)) - return -EINVAL; + (id > X2AVIC_MAX_PHYSICAL_ID)) { + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); + vcpu->arch.apic->apicv_active = false; + return 0; + } + + BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE || + (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE); - if (!vcpu->arch.apic->regs) + if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; if (kvm_apicv_activated(vcpu->kvm)) { @@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return ret; } - svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); + /* Note, fls64() returns the bit position, +1. */ + BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > + fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); /* Setting AVIC backing page address in the phy APIC ID table */ - entry = avic_get_physical_id_entry(vcpu, id); - if (!entry) - return -EINVAL; + new_entry = avic_get_backing_page_address(svm) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + svm->avic_physical_id_entry = new_entry; - new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); - WRITE_ONCE(*entry, new_entry); - - svm->avic_physical_id_cache = entry; + /* + * Initialize the real table, as vCPUs must have a valid entry in order + * for broadcast IPIs to function correctly (broadcast IPIs ignore + * invalid entries, i.e. aren't guaranteed to generate a VM-Exit). 
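The BUILD_BUG_ON above is what keeps the flat-table layout honest: every permitted vCPU ID must index an 8-byte entry within the single 4 KiB table page. The same arithmetic, restated as a standalone compile-time check; the 0xff and 0x1ff limits are written in here as assumptions for the demo.

#include <assert.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE			4096
#define DEMO_AVIC_MAX_PHYSICAL_ID	0xff	/* assumed xAPIC-mode limit */
#define DEMO_X2AVIC_MAX_PHYSICAL_ID	0x1ff	/* assumed x2APIC-mode limit */

/* (0xff + 1) * 8 = 2048 and (0x1ff + 1) * 8 = 4096, both fit in one page. */
static_assert((DEMO_AVIC_MAX_PHYSICAL_ID + 1) * sizeof(uint64_t) <= DEMO_PAGE_SIZE,
	      "AVIC physical ID table must fit in one page");
static_assert((DEMO_X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(uint64_t) <= DEMO_PAGE_SIZE,
	      "x2AVIC physical ID table must fit in one page");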
+ */ + WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); return 0; } @@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) avic_logical_id_table = NULL; else - avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); + avic_logical_id_table = kvm_svm->avic_logical_id_table; /* * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical @@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - u32 *logical_apic_id_table; u32 cluster, index; ldr = GET_APIC_LOGICAL_ID(ldr); @@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return NULL; index += (cluster << 2); - logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); - - return &logical_apic_id_table[index]; + return &kvm_svm->avic_logical_id_table[index]; } static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) @@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm) int ret; struct kvm_vcpu *vcpu = &svm->vcpu; + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; @@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm) if (ret) return ret; - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); svm->dfr_reg = APIC_DFR_FLAT; return ret; @@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) { - int ret = 0; + struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. 
- */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; + if (!vcpu) + return; - list_for_each_entry(ir, &svm->ir_list, node) { - if (activate) - ret = amd_iommu_activate_guest_mode(ir->data); - else - ret = amd_iommu_deactivate_guest_mode(ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; + spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + list_del(&irqfd->vcpu_list); + spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector) { - unsigned long flags; - struct amd_svm_iommu_ir *cur; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) - continue; - list_del(&cur->node); - kfree(cur); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - u64 entry; - - if (WARN_ON_ONCE(!pi->ir_data)) - return -EINVAL; - - /** - * In some cases, the existing irte is updated and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->prev_ga_tag) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); - } - - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - /* - * Update the target pCPU for IOMMU doorbells if the vCPU is running. - * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM - * will update the pCPU info when the vCPU awkened and/or scheduled in. - * See also avic_vcpu_load(). + * If the IRQ was affined to a different vCPU, remove the IRTE metadata + * from the *previous* vCPU's list. */ - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - true, pi->ir_data); - - list_add(&ir->node, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; -} + svm_ir_list_del(irqfd); -/* - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. 
- */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) -{ - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } - - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); - vcpu_info->vector = irq.vector; - - return 0; -} - -/* - * avic_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - bool enable_remapped_mode = true; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) - return 0; - - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; + if (vcpu) { + /* + * Try to enable guest_mode in IRTE, unless AVIC is inhibited, + * in which case configure the IRTE for legacy mode, but track + * the IRTE metadata so that it can be converted to guest mode + * if AVIC is enabled/uninhibited in the future. + */ + struct amd_iommu_pi_data pi_data = { + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + vcpu->vcpu_idx), + .is_guest_mode = kvm_vcpu_apicv_active(vcpu), + .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), + .vector = vector, + }; + struct vcpu_svm *svm = to_svm(vcpu); + u64 entry; + int ret; - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; + /* + * Prevent the vCPU from being scheduled out or migrated until + * the IRTE is updated and its metadata has been added to the + * list of IRQs being posted to the vCPU, to ensure the IRTE + * isn't programmed with stale pCPU/IsRunning information. + */ + guard(spinlock_irqsave)(&svm->ir_list_lock); - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtualization is disabled for the vcpu. - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is + * running. If the vCPU is NOT running, i.e. is blocking or + * scheduled out, KVM will update the pCPU info when the vCPU + * is awakened and/or scheduled in. See also avic_vcpu_load(). 
*/ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; - - enable_remapped_mode = false; - - /* Try to enable guest_mode in IRTE */ - pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & - AVIC_HPA_MASK); - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); + entry = svm->avic_physical_id_entry; + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { + pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + } else { + pi_data.cpu = -1; + pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; } - if (!ret && svm) { - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, - e->gsi, vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } + ret = irq_set_vcpu_affinity(host_irq, &pi_data); + if (ret) + return ret; - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; + /* + * Revert to legacy mode if the IOMMU didn't provide metadata + * for the IRTE, which KVM needs to keep the IRTE up-to-date, + * e.g. if the vCPU is migrated or AVIC is disabled. + */ + if (WARN_ON_ONCE(!pi_data.ir_data)) { + irq_set_vcpu_affinity(host_irq, NULL); + return -EIO; } - } - ret = 0; - if (enable_remapped_mode) { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; + irqfd->irq_bypass_data = pi_data.ir_data; + list_add(&irqfd->vcpu_list, &svm->ir_list); + return 0; + } + return irq_set_vcpu_affinity(host_irq, NULL); +} - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); +enum avic_vcpu_action { + /* + * There is no need to differentiate between activate and deactivate, + * as KVM only refreshes AVIC state when the vCPU is scheduled in and + * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is + * being (de)activated. + */ + AVIC_TOGGLE_ON_OFF = BIT(0), + AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF, + AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF, - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; + /* + * No unique action is required to deal with a vCPU that stops/starts + * running. A vCPU that starts running by definition stops blocking as + * well, and a vCPU that stops running can't have been blocking, i.e. + * doesn't need to toggle GALogIntr. + */ + AVIC_START_RUNNING = 0, + AVIC_STOP_RUNNING = 0, - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } - } -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} + /* + * When a vCPU starts blocking, KVM needs to set the GALogIntr flag + * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is + * sent to the vCPU. 
+ */ + AVIC_START_BLOCKING = BIT(1), +}; -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) +static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { - int ret = 0; - struct amd_svm_iommu_ir *ir; + bool ga_log_intr = (action & AVIC_START_BLOCKING); struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; lockdep_assert_held(&svm->ir_list_lock); - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. */ if (list_empty(&svm->ir_list)) - return 0; + return; - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); - if (ret) - return ret; + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + void *data = irqfd->irq_bypass_data; + + if (!(action & AVIC_TOGGLE_ON_OFF)) + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr)); + else if (cpu >= 0) + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr)); + else + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); } - return 0; } -void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { - u64 entry; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; + u64 entry; lockdep_assert_preemption_disabled(); if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; - /* - * No need to update anything if the vCPU is blocking, i.e. if the vCPU - * is being scheduled in after being preempted. The CPU entries in the - * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. - * If the vCPU was migrated, its new CPU value will be stuffed when the - * vCPU unblocks. - */ - if (kvm_vcpu_is_blocking(vcpu)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; /* @@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); - entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK | + AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + svm->avic_physical_id_entry = entry; + + /* + * If IPI virtualization is disabled, clear IsRunning when updating the + * actual Physical ID table, so that the CPU never sees IsRunning=1. + * Keep the APIC ID up-to-date in the entry to minimize the chances of + * things going sideways if hardware peeks at the ID. + */ + if (!enable_ipiv) + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); spin_unlock_irqrestore(&svm->ir_list_lock, flags); } -void avic_vcpu_put(struct kvm_vcpu *vcpu) +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - u64 entry; + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. 
The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + + __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); +} + +static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; + u64 entry = svm->avic_physical_id_entry; lockdep_assert_preemption_disabled(); - /* - * Note, reading the Physical ID entry outside of ir_list_lock is safe - * as only the pCPU that has loaded (or is loading) the vCPU is allowed - * to modify the entry, and preemption is disabled. I.e. the vCPU - * can't be scheduled out and thus avic_vcpu_{put,load}() can't run - * recursively. - */ - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - - /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ - if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; /* @@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + avic_update_iommu_vcpu_affinity(vcpu, -1, action); + + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); + /* + * Keep the previous APIC ID in the entry so that a rogue doorbell from + * hardware is at least restricted to a CPU associated with the vCPU. + */ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + + if (enable_ipiv) + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + + /* + * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as + * it's a synthetic flag that usurps an unused should-be-zero bit. + */ + if (action & AVIC_START_BLOCKING) + entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; + + svm->avic_physical_id_entry = entry; spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ + u64 entry = to_svm(vcpu)->avic_physical_id_entry; + + /* + * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the + * vCPU is preempted while its in the process of blocking. WARN if the + * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put + * the AVIC if it wasn't previously loaded. + */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { + if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) + return; + /* + * The vCPU was preempted while blocking, ensure its IRTEs are + * configured to generate GA Log Interrupts. + */ + if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) + return; + } + + __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING : + AVIC_STOP_RUNNING); } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { - bool activated = kvm_vcpu_apicv_active(vcpu); - if (!enable_apicv) return; + /* APICv should only be toggled on/off while the vCPU is running. 
*/ + WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); + avic_refresh_virtual_apic_mode(vcpu); - if (activated) - avic_vcpu_load(vcpu, vcpu->cpu); + if (kvm_vcpu_apicv_active(vcpu)) + __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); else - avic_vcpu_put(vcpu); - - avic_set_pi_irte_mode(vcpu, activated); + __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); } void avic_vcpu_blocking(struct kvm_vcpu *vcpu) @@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - /* - * Unload the AVIC when the vCPU is about to block, _before_ - * the vCPU actually blocks. - * - * Any IRQs that arrive before IsRunning=0 will not cause an - * incomplete IPI vmexit on the source, therefore vIRR will also - * be checked by kvm_vcpu_check_block() before blocking. The - * memory barrier implicit in set_current_state orders writing - * IsRunning=0 before reading the vIRR. The processor needs a - * matching memory barrier on interrupt delivery between writing - * IRR and reading IsRunning; the lack of this barrier might be - * the cause of errata #1235). - */ - avic_vcpu_put(vcpu); + /* + * Unload the AVIC when the vCPU is about to block, _before_ the vCPU + * actually blocks. + * + * Note, any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles + * this by checking vIRR one last time before blocking. The memory + * barrier implicit in set_current_state orders writing IsRunning=0 + * before reading the vIRR. The processor needs a matching memory + * barrier on interrupt delivery between writing IRR and reading + * IsRunning; the lack of this barrier might be the cause of errata #1235). + * + * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM + * doesn't need to detect events for scheduling purposes. The doorbell + * used to signal running vCPUs cannot be blocked, i.e. will perturb the + * CPU and cause noisy neighbor problems if the VM is sending interrupts + * to the vCPU while it's scheduled out. + */ + __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); } void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) @@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void) if (x2avic_enabled) pr_info("x2AVIC enabled\n"); + /* + * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) + * due to erratum 1235, which results in missed VM-Exits on the sender + * and thus missed wake events for blocking vCPUs due to the CPU + * failing to see a software update to clear IsRunning. + */ + enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); return true; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 834b67672d50..b7fd2e869998 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm) } /* + * This array (and its actual size) holds the set of offsets (indexing by chunk + * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the + * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g. + * based on CPUID features. This array only tracks MSRs that *might* be passed + * through to the guest. + * + * Hardcode the capacity of the array based on the maximum number of _offsets_. + * MSRs are batched together, so there are fewer offsets than MSRs. 
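The batching mentioned above means a single chunk-sized read of L1's bitmap covers many MSRs. Below is a standalone sketch of the offset arithmetic and of the merge semantics; the bit numbers are invented for the demo (in KVM they come from svm_msrpm_bit_nr()), and a set bit means the MSR access is intercepted, so OR-ing the chunks passes an MSR through to L2 only if both L0 and L1 leave its bits clear.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE	8
typedef uint64_t demo_chunk_t;

/* Index of the chunk that holds a given permission-bitmap bit. */
static unsigned int demo_chunk_offset(unsigned int bit_nr)
{
	return bit_nr / BITS_PER_BYTE / sizeof(demo_chunk_t);
}

int main(void)
{
	/* Two hypothetical MSRs whose read/write bit pairs land in chunk 1. */
	assert(demo_chunk_offset(66) == 1);
	assert(demo_chunk_offset(104) == 1);	/* same chunk, one read of L1's bitmap */

	/*
	 * A set bit means "intercept": OR-ing L0's and L1's chunks intercepts
	 * an MSR unless both leave its read and write bits clear.
	 */
	demo_chunk_t l0 = 0x3;		/* L0 intercepts one MSR's read and write */
	demo_chunk_t l1 = 0xc;		/* L1 intercepts a neighbouring MSR */
	demo_chunk_t merged = l0 | l1;

	printf("merged chunk = %#llx\n", (unsigned long long)merged);	/* 0xf */
	return 0;
}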
+ */ +static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; +static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; +typedef unsigned long nsvm_msrpm_merge_t; + +int __init nested_svm_init_msrpm_merge_offsets(void) +{ + static const u32 merge_msrs[] __initconst = { + MSR_STAR, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_EIP, + MSR_IA32_SYSENTER_ESP, + #ifdef CONFIG_X86_64 + MSR_GS_BASE, + MSR_FS_BASE, + MSR_KERNEL_GS_BASE, + MSR_LSTAR, + MSR_CSTAR, + MSR_SYSCALL_MASK, + #endif + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_FLUSH_CMD, + MSR_IA32_APERF, + MSR_IA32_MPERF, + MSR_IA32_LASTBRANCHFROMIP, + MSR_IA32_LASTBRANCHTOIP, + MSR_IA32_LASTINTFROMIP, + MSR_IA32_LASTINTTOIP, + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) { + int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]); + u32 offset; + + if (WARN_ON(bit_nr < 0)) + return -EIO; + + /* + * Merging is done in chunks to reduce the number of accesses + * to L1's bitmap. + */ + offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t); + + for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) { + if (nested_svm_msrpm_merge_offsets[j] == offset) + break; + } + + if (j < nested_svm_nr_msrpm_merge_offsets) + continue; + + if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets))) + return -EIO; + + nested_svm_msrpm_merge_offsets[j] = offset; + nested_svm_nr_msrpm_merge_offsets++; + } + + return 0; +} + +/* * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function * is optimized in that it only merges the parts where KVM MSR permission bitmap * may contain zero bits. */ -static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) +static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm; + nsvm_msrpm_merge_t *msrpm01 = svm->msrpm; int i; /* @@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) if (!svm->nested.force_msr_bitmap_recalc) { struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; - if (kvm_hv_hypercall_enabled(&svm->vcpu) && + if (kvm_hv_hypercall_enabled(vcpu) && hve->hv_enlightenments_control.msr_bitmap && (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) goto set_msrpm_base_pa; @@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; - for (i = 0; i < MSRPM_OFFSETS; i++) { - u32 value, p; - u64 offset; - - if (msrpm_offsets[i] == 0xffffffff) - break; - - p = msrpm_offsets[i]; + for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) { + const int p = nested_svm_msrpm_merge_offsets[i]; + nsvm_msrpm_merge_t l1_val; + gpa_t gpa; - /* x2apic msrs are intercepted always for the nested guest */ - if (is_x2apic_msrpm_offset(p)) - continue; - - offset = svm->nested.ctl.msrpm_base_pa + (p * 4); + gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val)); - if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) + if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val))) return false; - svm->nested.msrpm[p] = svm->msrpm[p] | value; + msrpm02[p] = msrpm01[p] | l1_val; } svm->nested.force_msr_bitmap_recalc = false; @@ -678,6 +745,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter 
to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -910,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) goto out_exit_err; - if (nested_svm_vmrun_msrpm(svm)) + if (nested_svm_merge_msrpm(vcpu)) goto out; out_exit_err: @@ -1039,8 +1133,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1194,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm) svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) goto err_free_vmcb02; - svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->nested.initialized = true; return 0; @@ -1254,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - u32 offset, msr, value; - int write, mask; + gpa_t base = svm->nested.ctl.msrpm_base_pa; + int write, bit_nr; + u8 value, mask; + u32 msr; if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - offset = svm_msrpm_offset(msr); + bit_nr = svm_msrpm_bit_nr(msr); write = svm->vmcb->control.exit_info_1 & 1; - mask = 1 << ((2 * (msr & 0xf)) + write); - if (offset == MSR_INVALID) + if (bit_nr < 0) return NESTED_EXIT_DONE; - /* Offset is in 32 bit units but need in 8 bit units */ - offset *= 4; - - if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE, + &value, sizeof(value))) return NESTED_EXIT_DONE; + mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1)); return (value & mask) ? 
NESTED_EXIT_DONE : NESTED_EXIT_HOST; } @@ -1783,13 +1885,11 @@ out_free: static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - if (WARN_ON(!is_guest_mode(vcpu))) return true; if (!vcpu->arch.pdptrs_from_userspace && - !nested_npt_enabled(svm) && is_pae_paging(vcpu)) + !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu)) /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested @@ -1798,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; - if (!nested_svm_vmrun_msrpm(svm)) { + if (!nested_svm_merge_msrpm(vcpu)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 1aa0f07d3a63..2fbdebf79fbb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) */ down_write(&sev_deactivate_lock); + /* SNP firmware requires use of WBINVD for ASID recycling. */ wbinvd_on_all_cpus(); if (sev_snp_enabled) @@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, init_args.probe = false; ret = sev_platform_init(&init_args); if (ret) - goto e_free; + goto e_free_asid; + + if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto e_free_asid; + } /* This needs to happen after SEV/SNP firmware initialization. */ if (vm_type == KVM_X86_SNP_VM) { @@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, return 0; e_free: + free_cpumask_var(sev->have_run_cpus); +e_free_asid: argp->error = init_args.error; sev_asid_free(sev); sev->asid = 0; @@ -561,6 +569,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -706,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) } } +static void sev_writeback_caches(struct kvm *kvm) +{ + /* + * Note, the caller is responsible for ensuring correctness if the mask + * can be modified, e.g. if a CPU could be doing VMRUN. + */ + if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus)) + return; + + /* + * Ensure that all dirty guest tagged cache entries are written back + * before releasing the pages back to the system for use. CLFLUSH will + * not do this without SME_COHERENT, and flushing many cache lines + * individually is slower than blasting WBINVD for large VMs, so issue + * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) + * on CPUs that have done VMRUN, i.e. may have dirtied data using the + * VM's ASID. + * + * For simplicity, never remove CPUs from the bitmap. Ideally, KVM + * would clear the mask when flushing caches, but doing so requires + * serializing multiple calls and having responding CPUs (to the IPI) + * mark themselves as still running if they are running (or about to + * run) a vCPU for the VM. 
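A rough standalone illustration of that bookkeeping, using a plain bitmask in place of the cpumask and invented names: a CPU is recorded the first time it enters the guest, a later reclaim flushes only the recorded CPUs, and the mask is only ever grown, matching the simplification described above.

#include <stdint.h>
#include <stdio.h>

/* One bit per CPU; stands in for the per-VM have_run_cpus mask. */
static uint64_t demo_have_run_cpus;

static void demo_pre_run(int cpu)
{
	/* Test before set, mirroring the cheap fast path taken on every run. */
	if (!(demo_have_run_cpus & (1ull << cpu)))
		demo_have_run_cpus |= 1ull << cpu;
}

static void demo_writeback_caches(void)
{
	if (!demo_have_run_cpus)
		return;			/* no CPU ever ran the VM, nothing to flush */

	for (int cpu = 0; cpu < 64; cpu++) {
		if (demo_have_run_cpus & (1ull << cpu))
			printf("flush caches on CPU %d\n", cpu);
	}
	/* The mask is deliberately left populated, as explained above. */
}

int main(void)
{
	demo_pre_run(2);
	demo_pre_run(5);
	demo_pre_run(2);		/* already recorded, no change */
	demo_writeback_caches();	/* flushes CPUs 2 and 5 only */
	return 0;
}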
+ */ + wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); +} + static unsigned long get_num_contig_pages(unsigned long idx, struct page **inpages, unsigned long npages) { @@ -1593,11 +1630,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1883,70 +1920,6 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - -#ifdef CONFIG_PROVE_LOCKING - if (!i) - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - else - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); -#endif - } - - return 0; - -out_unlock: - - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - -#ifdef CONFIG_PROVE_LOCKING - if (j) - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); -#endif - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); @@ -2033,6 +2006,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) struct kvm_vcpu *src_vcpu; unsigned long i; + if (src->created_vcpus != atomic_read(&src->online_vcpus) || + dst->created_vcpus != atomic_read(&dst->online_vcpus)) + return -EBUSY; + if (!sev_es_guest(src)) return 0; @@ -2084,10 +2061,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -2095,15 +2072,26 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) if (ret) goto out_source_vcpu; + /* + * Allocate a new have_run_cpus for the destination, i.e. don't copy + * the set of CPUs from the source. If a CPU was used to run a vCPU in + * the source VM but is never used for the destination VM, then the CPU + * can only have cached memory that was accessible to the source VM. 
+ */ + if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto out_source_vcpu; + } + sev_migrate_from(kvm, source_kvm); kvm_vm_dead(source_kvm); cg_cleanup_sev = src_sev; ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -2193,12 +2181,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; /* Check for policy bits that must be set */ - if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || - !(params.policy & SNP_POLICY_MASK_SMT)) + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) return -EINVAL; - if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) - return -EINVAL; + sev->policy = params.policy; sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) @@ -2754,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, goto failed; } - /* - * Ensure that all guest tagged cache entries are flushed before - * releasing the pages back to the system for use. CLFLUSH will - * not do this, so issue a WBINVD. - */ - wbinvd_on_all_cpus(); + sev_writeback_caches(kvm); __unregister_enc_region_locked(kvm, region); @@ -2801,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) goto e_unlock; } + mirror_sev = to_kvm_sev_info(kvm); + if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { + ret = -ENOMEM; + goto e_unlock; + } + /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2869,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm) WARN_ON(!list_empty(&sev->mirror_vms)); - /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ + free_cpumask_var(sev->have_run_cpus); + + /* + * If this is a mirror VM, remove it from the owner's list of a mirrors + * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). + * Note, mirror VMs don't support registering encrypted regions. + */ if (is_mirroring_enc_context(kvm)) { struct kvm *owner_kvm = sev->enc_context_owner; @@ -2880,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm) return; } - /* - * Ensure that all guest tagged cache entries are flushed before - * releasing the pages back to the system for use. CLFLUSH will - * not do this, so issue a WBINVD. 
- */ - wbinvd_on_all_cpus(); /* * if userspace was terminated before unregistering the memory regions @@ -2931,6 +2917,33 @@ void __init sev_set_cpu_caps(void) } } +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; +} + void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; @@ -3035,6 +3048,14 @@ void __init sev_hardware_setup(void) sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : @@ -3061,15 +3082,6 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; - - if (!sev_enabled) - return; - - /* - * Do both SNP and SEV initialization at KVM module load. - */ - init_args.probe = true; - sev_platform_init(&init_args); } void sev_hardware_unsetup(void) @@ -3129,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) /* * VM Page Flush takes a host virtual address and a guest ASID. Fall - * back to WBINVD if this faults so as not to make any problems worse - * by leaving stale encrypted data in the cache. + * back to full writeback of caches if this faults so as not to make + * any problems worse by leaving stale encrypted data in the cache. */ if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) - goto do_wbinvd; + goto do_sev_writeback_caches; return; -do_wbinvd: - wbinvd_on_all_cpus(); +do_sev_writeback_caches: + sev_writeback_caches(vcpu->kvm); } void sev_guest_memory_reclaimed(struct kvm *kvm) { /* * With SNP+gmem, private/encrypted memory is unreachable via the - * hva-based mmu notifiers, so these events are only actually - * pertaining to shared pages where there is no need to perform - * the WBINVD to flush associated caches. + * hva-based mmu notifiers, i.e. these events are explicitly scoped to + * shared pages, where there's no need to flush caches. */ if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; - wbinvd_on_all_cpus(); + sev_writeback_caches(kvm); } void sev_free_vcpu(struct kvm_vcpu *vcpu) @@ -3484,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu) if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) return -EINVAL; + /* + * To optimize cache flushes when memory is reclaimed from an SEV VM, + * track physical CPUs that enter the guest for SEV VMs and thus can + * have encrypted, dirty data in the cache, and flush caches only for + * CPUs that have entered the guest. 
+ */ + if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) + cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); + /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -3916,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) * From this point forward, the VMSA will always be a guest-mapped page * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but - * that involves cleanups like wbinvd_on_all_cpus() which would ideally - * be handled during teardown rather than guest boot. Deferring that - * also allows the existing logic for SEV-ES VMSAs to be re-used with + * that involves cleanups like flushing caches, which would ideally be + * handled during teardown rather than guest boot. Deferring that also + * allows the existing logic for SEV-ES VMSAs to be re-used with * minimal SNP-specific changes. */ svm->sev_es.snp_has_guest_vmsa = true; @@ -4007,10 +4027,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) * Unless Creation is deferred until INIT, signal the vCPU to update * its state. */ - if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) { - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); return 0; } @@ -4422,16 +4440,17 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) count, in); } -static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) +void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + /* Clear intercepts on MSRs that are context switched by hardware. */ + svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); - - set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); - } + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) + svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); /* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if @@ -4445,11 +4464,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) * XSAVES being exposed to the guest so that KVM can at least honor * guest CPUID for RDMSR and WRMSR. 
*/ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); - else - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); + svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW, + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) || + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)); } void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) @@ -4461,15 +4478,12 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); - - if (sev_es_guest(svm->vcpu.kvm)) - sev_es_vcpu_after_set_cpuid(svm); } static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; - struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; @@ -4480,8 +4494,16 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (!svm->sev_es.snp_has_guest_vmsa) { + if (svm->sev_es.vmsa) + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + else + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + } + + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -4519,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); - - /* Clear intercepts on selected MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) @@ -4911,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) /* * SEV-ES avoids host/guest cache coherency issues through - * WBINVD hooks issued via MMU notifiers during run-time, and + * WBNOINVD hooks issued via MMU notifiers during run-time, and * KVM's VM destroy path at shutdown. Those MMU notifier events * don't cover gmem since there is no requirement to map pages * to a HVA in order to use them for a running guest. While the @@ -4943,3 +4961,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return level; } + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. 
+ */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 67fee545d42a..d9931c6c4bc6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -29,6 +29,7 @@ #include <linux/cc_platform.h> #include <linux/smp.h> #include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> #include <asm/msr.h> @@ -71,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; -u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - /* * Set osvw_len to higher value when updated Revision Guides * are published and we know what the new status bits are @@ -81,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) - -static const struct svm_direct_access_msrs { - u32 index; /* Index of the MSR */ - bool always; /* True if intercept is initially cleared */ -} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { - { .index = MSR_STAR, .always = true }, - { .index = MSR_IA32_SYSENTER_CS, .always = true }, - { .index = MSR_IA32_SYSENTER_EIP, .always = false }, - { .index = MSR_IA32_SYSENTER_ESP, .always = false }, -#ifdef CONFIG_X86_64 - { .index = MSR_GS_BASE, .always = true }, - { .index = MSR_FS_BASE, .always = true }, - { .index = MSR_KERNEL_GS_BASE, .always = true }, - { .index = MSR_LSTAR, .always = true }, - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, -#endif - { .index = MSR_IA32_SPEC_CTRL, .always = false }, - { .index = MSR_IA32_PRED_CMD, .always = false }, - { .index = MSR_IA32_FLUSH_CMD, .always = false }, - { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, - 
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, - { .index = MSR_IA32_LASTINTTOIP, .always = false }, - { .index = MSR_IA32_XSS, .always = false }, - { .index = MSR_EFER, .always = false }, - { .index = MSR_IA32_CR_PAT, .always = false }, - { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, - { .index = MSR_TSC_AUX, .always = false }, - { .index = X2APIC_MSR(APIC_ID), .always = false }, - { .index = X2APIC_MSR(APIC_LVR), .always = false }, - { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, - { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, - { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, - { .index = X2APIC_MSR(APIC_EOI), .always = false }, - { .index = X2APIC_MSR(APIC_RRR), .always = false }, - { .index = X2APIC_MSR(APIC_LDR), .always = false }, - { .index = X2APIC_MSR(APIC_DFR), .always = false }, - { .index = X2APIC_MSR(APIC_SPIV), .always = false }, - { .index = X2APIC_MSR(APIC_ISR), .always = false }, - { .index = X2APIC_MSR(APIC_TMR), .always = false }, - { .index = X2APIC_MSR(APIC_IRR), .always = false }, - { .index = X2APIC_MSR(APIC_ESR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR), .always = false }, - { .index = X2APIC_MSR(APIC_ICR2), .always = false }, - - /* - * Note: - * AMD does not virtualize APIC TSC-deadline timer mode, but it is - * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, - * the AVIC hardware would generate GP fault. Therefore, always - * intercept the MSR 0x832, and do not setup direct_access_msr. - */ - { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, - { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, - { .index = X2APIC_MSR(APIC_LVT0), .always = false }, - { .index = X2APIC_MSR(APIC_LVT1), .always = false }, - { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, - { .index = X2APIC_MSR(APIC_TMICT), .always = false }, - { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, - { .index = X2APIC_MSR(APIC_TDCR), .always = false }, - { .index = MSR_INVALID, .always = false }, -}; - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -231,6 +164,9 @@ module_param(tsc_scaling, int, 0444); */ static bool avic; module_param(avic, bool, 0444); +module_param(enable_ipiv, bool, 0444); + +module_param(enable_device_posted_irqs, bool, 0444); bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -250,6 +186,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. 
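The sev_writeback_caches() calls that replace wbinvd_on_all_cpus() throughout this patch rely on the new have_run_cpus mask that pre_sev_run() fills in. A minimal sketch (not part of the patch) of what such a helper can look like, assuming a mask-targeted write-back primitive; the wbnoinvd_on_cpus_mask() name below is illustrative, not taken from this series:

static void sev_writeback_caches_sketch(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	/*
	 * A CPU that has never done VMRUN for this VM cannot hold dirty,
	 * guest-tagged cache lines, so an empty mask means nothing to flush.
	 */
	if (!cpumask_empty(sev->have_run_cpus))
		wbnoinvd_on_cpus_mask(sev->have_run_cpus);	/* assumed primitive */
}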
@@ -259,33 +197,6 @@ DEFINE_PER_CPU(struct svm_cpu_data, svm_data); */ static int tsc_aux_uret_slot __read_mostly = -1; -static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; - -#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -#define MSRS_RANGE_SIZE 2048 -#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) - -u32 svm_msrpm_offset(u32 msr) -{ - u32 offset; - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr < msrpm_ranges[i] || - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) - continue; - - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ - - /* Now we have the u8 offset - but need the u32 offset */ - return offset / 4; - } - - /* MSR not in any range */ - return MSR_INVALID; -} - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -752,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm) recalc_intercepts(svm); } -static int direct_access_msr_slot(u32 msr) -{ - u32 i; - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == msr) - return i; - - return -ENOENT; -} - -static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, - int write) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int slot = direct_access_msr_slot(msr); - - if (slot == -ENOENT) - return; - - /* Set the shadow bitmaps to the desired intercept states */ - if (read) - set_bit(slot, svm->shadow_msr_intercept.read); - else - clear_bit(slot, svm->shadow_msr_intercept.read); - - if (write) - set_bit(slot, svm->shadow_msr_intercept.write); - else - clear_bit(slot, svm->shadow_msr_intercept.write); -} - -static bool valid_msr_intercept(u32 index) -{ - return direct_access_msr_slot(index) != -ENOENT; -} - static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { - u8 bit_write; - unsigned long tmp; - u32 offset; - u32 *msrpm; - /* * For non-nested case: * If the L01 MSR bitmap does not intercept the MSR, then we need to @@ -805,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: - to_svm(vcpu)->msrpm; - - offset = svm_msrpm_offset(msr); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; + void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : + to_svm(vcpu)->msrpm; - BUG_ON(offset == MSR_INVALID); - - return test_bit(bit_write, &tmp); + return svm_test_msr_bitmap_write(msrpm, msr); } -static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, - u32 msr, int read, int write) +void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_svm *svm = to_svm(vcpu); - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - /* - * If this warning triggers extend the direct_access_msrs list at the - * beginning of the file - */ - WARN_ON(!valid_msr_intercept(msr)); - - /* Enforce non allowed MSRs to trap */ - if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) - read = 0; + void *msrpm = svm->msrpm; - if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) - write = 0; - - offset = svm_msrpm_offset(msr); - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + /* Don't disable interception for MSRs userspace wants to handle. 
*/ + if (type & MSR_TYPE_R) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + svm_clear_msr_bitmap_read(msrpm, msr); + else + svm_set_msr_bitmap_read(msrpm, msr); + } - msrpm[offset] = tmp; + if (type & MSR_TYPE_W) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + svm_clear_msr_bitmap_write(msrpm, msr); + else + svm_set_msr_bitmap_write(msrpm, msr); + } svm_hv_vmcb_dirty_nested_enlightenments(vcpu); svm->nested.force_msr_bitmap_recalc = true; } -void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) -{ - set_shadow_msr_intercept(vcpu, msr, read, write); - set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); -} - -u32 *svm_vcpu_alloc_msrpm(void) +void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) { - unsigned int order = get_order(MSRPM_SIZE); - struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); - u32 *msrpm; + unsigned int order = get_order(size); + struct page *pages = alloc_pages(gfp_mask, order); + void *pm; if (!pages) return NULL; - msrpm = page_address(pages); - memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); + /* + * Set all bits in the permissions map so that all MSR and I/O accesses + * are intercepted by default. + */ + pm = page_address(pages); + memset(pm, 0xff, PAGE_SIZE * (1 << order)); - return msrpm; + return pm; } -void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - int i; + bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - if (!direct_access_msrs[i].always) - continue; - set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); - } + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); + + if (sev_es_guest(vcpu->kvm)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); } void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) { + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. 
+ */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; int i; if (intercept == svm->x2avic_msrs_intercepted) @@ -897,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (!x2avic_enabled) return; - for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { - int index = direct_access_msrs[i].index; - - if ((index < APIC_BASE_MSR) || - (index > APIC_BASE_MSR + 0xff)) - continue; - set_msr_interception(&svm->vcpu, svm->msrpm, index, - !intercept, !intercept); - } + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); svm->x2avic_msrs_intercepted = intercept; } -void svm_vcpu_free_msrpm(u32 *msrpm) +void svm_vcpu_free_msrpm(void *msrpm) { __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } -static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u32 i; - /* - * Set intercept permissions for all direct access MSRs again. They - * will automatically get filtered through the MSR filter, so we are - * back in sync after this. - */ - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 msr = direct_access_msrs[i].index; - u32 read = test_bit(i, svm->shadow_msr_intercept.read); - u32 write = test_bit(i, svm->shadow_msr_intercept.write); + svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); - } -} - -static void add_msr_offset(u32 offset) -{ - int i; - - for (i = 0; i < MSRPM_OFFSETS; ++i) { - - /* Offset already in list? */ - if (msrpm_offsets[i] == offset) - return; +#ifdef CONFIG_X86_64 + svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); + svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); +#endif - /* Slot used by another offset? */ - if (msrpm_offsets[i] != MSR_INVALID) - continue; + if (lbrv) + svm_recalc_lbr_msr_intercepts(vcpu); - /* Add offset to list */ - msrpm_offsets[i] = offset; + if (cpu_feature_enabled(X86_FEATURE_IBPB)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); - return; - } + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); /* - * If this BUG triggers the msrpm_offsets table has an overflow. Just - * increase MSRPM_OFFSETS in this case. + * Disable interception of SPEC_CTRL if KVM doesn't need to manually + * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if + * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively + * using SPEC_CTRL. 
*/ - BUG(); -} - -static void init_msrpm_offsets(void) -{ - int i; - - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !guest_has_spec_ctrl_msr(vcpu)); + else + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !svm->spec_ctrl); - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 offset; + /* + * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, + * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits. + */ + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, + guest_cpuid_is_intel_compatible(vcpu)); + + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); + } - offset = svm_msrpm_offset(direct_access_msrs[i].index); - BUG_ON(offset == MSR_INVALID); + if (sev_es_guest(vcpu->kvm)) + sev_es_recalc_msr_intercepts(vcpu); - add_msr_offset(offset); - } + /* + * x2APIC intercepts are modified on-demand and cannot be filtered by + * userspace. + */ } void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -993,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - if (sev_es_guest(vcpu->kvm)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + svm_recalc_lbr_msr_intercepts(vcpu); /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ if (is_guest_mode(vcpu)) @@ -1011,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + svm_recalc_lbr_msr_intercepts(vcpu); /* * Move the LBR msrs back to the vmcb01 to avoid copying them @@ -1171,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) } /* Evaluate instruction intercepts that depend on guest CPUID features. */ -static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, - struct vcpu_svm *svm) +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + /* * Intercept INVPCID if shadow paging is enabled to sync/free shadow * roots, or if INVPCID is disabled in the guest to inject #UD. 
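With the direct_access_msrs table and its shadow bitmaps removed, MSR interception is tracked directly in the 8 KiB MSR permissions map. A worked example of the indexing scheme encoded by the svm.h helpers added later in this patch (three 2 KiB ranges based at 0x0, 0xc0000000 and 0xc0010000, two bits per MSR, read bit first):

/* MSR_LSTAR = 0xc0000082: range 1, offset 0x82 within that range. */
#define EXAMPLE_MSR		0xc0000082
#define EXAMPLE_READ_BIT	(1 * 2048 * 8 + (EXAMPLE_MSR & 0x1fff) * 2)

/* read bit = 16384 + 260 = 16644, write bit = read bit + 1 = 16645 */
static_assert(EXAMPLE_READ_BIT == 16644);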
@@ -1192,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, else svm_set_intercept(svm, INTERCEPT_RDTSCP); } -} - -static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); if (guest_cpuid_is_intel_compatible(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ svm_set_intercept(svm, INTERCEPT_VMLOAD); svm_set_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1220,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); } } +static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + svm_recalc_instruction_intercepts(vcpu); + svm_recalc_msr_intercepts(vcpu); +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1348,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_PAUSE); } - svm_recalc_instruction_intercepts(vcpu, svm); - - /* - * If the host supports V_SPEC_CTRL then disable the interception - * of MSR_IA32_SPEC_CTRL. - */ - if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); - if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); @@ -1369,11 +1217,15 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); svm_hv_init_vmcb(vmcb); - init_vmcb_after_set_cpuid(vcpu); + + svm_recalc_intercepts_after_set_cpuid(vcpu); vmcb_mark_all_dirty(vmcb); @@ -1384,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm_init_osvw(vcpu); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) @@ -1478,24 +1328,11 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. 
- */ - svm_clear_current_vmcb(svm->vmcb); + WARN_ON_ONCE(!list_empty(&svm->ir_list)); svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1503,7 +1340,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) sev_free_vcpu(vcpu); __free_page(__sme_pa_to_page(svm->vmcb01.pa)); - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); + svm_vcpu_free_msrpm(svm->msrpm); } #ifdef CONFIG_CPU_MITIGATIONS @@ -1610,19 +1447,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && - static_branch_likely(&switch_vcpu_ibpb)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -2897,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data) return 0; } -static bool -sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info) { return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && - svm_msrpm_offset(msr_info->index) != MSR_INVALID && !msr_write_intercepted(vcpu, msr_info->index); } @@ -3133,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_svm_vmrun_msrpm. + * nested_svm_merge_msrpm(). * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -3203,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) /* * TSC_AUX is usually changed only during boot and never read - * directly. Intercept TSC_AUX instead of exposing it to the - * guest via direct_access_msrs, and switch it via user return. + * directly. Intercept TSC_AUX and switch it via user return. */ preempt_disable(); ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); @@ -3221,17 +3046,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } /* - * AMD changed the architectural behavior of bits 5:2. On CPUs - * without BusLockTrap, bits 5:2 control "external pins", but - * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap - * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed - * the guest to set bits 5:2 despite not actually virtualizing - * Performance-Monitoring/Breakpoint external pins. Drop bits - * 5:2 for backwards compatibility. - */ - data &= ~GENMASK(5, 2); - - /* * Suppress BTF as KVM doesn't virtualize BTF, but there's no * way to communicate lack of support to the guest. */ @@ -3361,6 +3175,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. 
+ */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3430,6 +3275,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -3444,14 +3290,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3489,6 +3342,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3559,6 +3423,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + 
"r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3595,6 +3516,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -4306,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, - bool force_immediate_exit) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_svm *svm = to_svm(vcpu); bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); @@ -4355,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, svm_hv_update_vp_id(svm->vmcb, vcpu); /* - * Run with all-zero DR6 unless needed, so that we can get the exact cause - * of a #DB. + * Run with all-zero DR6 unless the guest can write DR6 freely, so that + * KVM can get the exact cause of a #DB. Note, loading guest DR6 from + * KVM's snapshot is only necessary when DR accesses won't exit. 
*/ - if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) + svm_set_dr6(vcpu, vcpu->arch.dr6); + else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); @@ -4538,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (guest_cpuid_is_intel_compatible(vcpu)) guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - svm_recalc_instruction_intercepts(vcpu, svm); - - if (boot_cpu_has(X86_FEATURE_IBPB)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, - !!guest_has_pred_cmd_msr(vcpu)); - - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - init_vmcb_after_set_cpuid(vcpu); + svm_recalc_intercepts_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -5102,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm) } if (!pause_filter_count || !pause_filter_thresh) - kvm->arch.pause_in_guest = true; + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (enable_apicv) { int ret = avic_vm_init(kvm); @@ -5169,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -5254,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .msr_filter_changed = svm_msr_filter_changed, + .recalc_msr_intercepts = svm_recalc_msr_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, @@ -5356,6 +5273,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5387,11 +5307,8 @@ static __init void svm_set_cpu_caps(void) static __init int svm_hardware_setup(void) { - int cpu; - struct page *iopm_pages; void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); + int cpu, r; /* * NX is required for shadow paging and for NPT if the NX huge pages @@ -5403,17 +5320,6 @@ static __init int svm_hardware_setup(void) } kvm_enable_efer_bits(EFER_NX); - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = __sme_page_pa(iopm_pages); - - init_msrpm_offsets(); - kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); @@ -5447,6 +5353,10 @@ static __init int svm_hardware_setup(void) if (nested) { pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + + r = nested_svm_init_msrpm_merge_offsets(); + if (r) + return r; } /* @@ -5478,6 +5388,13 @@ static __init int svm_hardware_setup(void) else pr_info("LBR virtualization supported\n"); } + + iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); + if (!iopm_va) + return -ENOMEM; + + iopm_base = __sme_set(__pa(iopm_va)); + /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. 
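A usage sketch for the interception helpers used above and declared in svm.h below; the MSR/feature pairings are illustrative only. Note that "set" means "intercept", and that the helper quietly keeps the intercept for any MSR a userspace MSR filter wants to handle:

static void example_recalc_one_msr(struct kvm_vcpu *vcpu)
{
	/* Pass an MSR through only when the guest can actually use it. */
	svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
				  !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP));

	/* Unconditional passthrough or interception via the wrappers. */
	svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
	svm_enable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
}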
@@ -5495,6 +5412,7 @@ static __init int svm_hardware_setup(void) enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { + enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; @@ -5551,6 +5469,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: @@ -5575,6 +5494,8 @@ static int __init svm_init(void) { int r; + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); + __unused_size_checks(); if (!kvm_is_svm_supported()) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f16b068c4228..58b9d168e0c8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -44,9 +44,6 @@ static inline struct page *__sme_pa_to_page(unsigned long pa) #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 48 -#define MSRPM_OFFSETS 32 -extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int nrips; extern int vgif; @@ -98,6 +95,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ @@ -112,15 +110,19 @@ struct kvm_sev_info { void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ + cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; /* Struct members for AVIC */ u32 avic_vm_id; - struct page *avic_logical_id_table_page; - struct page *avic_physical_id_table_page; + u32 *avic_logical_id_table; + u64 *avic_physical_id_table; struct hlist_node hnode; struct kvm_sev_info sev_info; @@ -169,6 +171,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -184,8 +187,11 @@ struct svm_nested_state { u64 vmcb12_gpa; u64 last_vmcb12_gpa; - /* These are the merged vectors */ - u32 *msrpm; + /* + * The MSR permissions map used for vmcb02, which is the merge result + * of vmcb01 and vmcb12 + */ + void *msrpm; /* A VMRUN has started but has not yet been performed, so * we cannot inject a nested vmexit yet. */ @@ -266,7 +272,7 @@ struct vcpu_svm { */ u64 virt_spec_ctrl; - u32 *msrpm; + void *msrpm; ulong nmi_iret_rip; @@ -301,24 +307,26 @@ struct vcpu_svm { u32 ldr_reg; u32 dfr_reg; - struct page *avic_backing_page; - u64 *avic_physical_id_cache; + + /* This is essentially a shadow of the vCPU's actual entry in the + * Physical ID table that is programmed into the VMCB, i.e. that is + * seen by the CPU. If IPI virtualization is disabled, IsRunning is + * only ever set in the shadow, i.e. is never propagated to the "real" + * table, so that hardware never sees IsRunning=1. + */ + u64 avic_physical_id_entry; /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. 
This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. + * Per-vCPU list of irqfds that are eligible to post IRQs directly to + * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list + * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the + * target pCPU), when AVIC is toggled on/off (to (de)activate bypass), + * and if the irqfd becomes ineligible for posting (to put the IRTE + * back into remapped mode). */ struct list_head ir_list; spinlock_t ir_list_lock; - /* Save desired MSR intercept (read: pass-through) state */ - struct { - DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); - DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); - } shadow_msr_intercept; - struct vcpu_sev_es_state sev_es; bool guest_state_loaded; @@ -340,8 +348,6 @@ struct svm_cpu_data { struct vmcb *save_area; unsigned long save_area_pa; - struct vmcb *current_vmcb; - /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; @@ -610,17 +616,74 @@ static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); } -/* svm.c */ -#define MSR_INVALID 0xffffffffU +/* + * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range + * is reserved). Each MSR within a range is covered by two bits, one each for + * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted. + */ +#define SVM_MSRPM_BYTES_PER_RANGE 2048 +#define SVM_BITS_PER_MSR 2 +#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR) +#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE) +static_assert(SVM_MSRS_PER_RANGE == 8192); +#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1) + +static __always_inline int svm_msrpm_bit_nr(u32 msr) +{ + int range_nr; + + switch (msr & ~SVM_MSRPM_OFFSET_MASK) { + case 0: + range_nr = 0; + break; + case 0xc0000000: + range_nr = 1; + break; + case 0xc0010000: + range_nr = 2; + break; + default: + return -EINVAL; + } + + return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE + + (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR; +} + +#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \ +static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int bit_nr; \ + \ + bit_nr = svm_msrpm_bit_nr(msr); \ + if (bit_nr < 0) \ + return (rtype)true; \ + \ + return bitop##_bit(bit_nr + bit_rw, bitmap); \ +} + +#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \ + __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1) + +BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set) #define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) +/* svm.c */ extern bool dump_invalid_vmcb; -u32 svm_msrpm_offset(u32 msr); -u32 *svm_vcpu_alloc_msrpm(void); -void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); -void svm_vcpu_free_msrpm(u32 *msrpm); +void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask); + +static inline void *svm_vcpu_alloc_msrpm(void) +{ + return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT); +} + +void svm_vcpu_free_msrpm(void *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); @@ -640,6 +703,20 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm 
*svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); +void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); + +static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + svm_set_intercept_for_msr(vcpu, msr, type, false); +} + +static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + svm_set_intercept_for_msr(vcpu, msr, type, true); +} + /* nested.c */ #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ @@ -668,6 +745,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } +int __init nested_svm_init_msrpm_merge_offsets(void); + int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); void svm_leave_nested(struct kvm_vcpu *vcpu); @@ -718,7 +797,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \ - BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \ ) bool avic_hardware_setup(void); @@ -733,8 +813,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); @@ -749,6 +830,7 @@ void sev_init_vmcb(struct vcpu_svm *svm); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_vcpu_reset(struct vcpu_svm *svm); +void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); @@ -785,6 +867,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { @@ -816,6 +900,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} #endif /* vmenter.S */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 0c61153b275f..235c4af6b692 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -169,6 +169,9 @@ 
SYM_FUNC_START(__svm_vcpu_run) #endif mov VCPU_RDI(%_ASM_DI), %_ASM_DI + /* Clobbers EFLAGS.ZF */ + VM_CLEAR_CPU_BUFFERS + /* Enter guest mode */ 3: vmrun %_ASM_AX 4: @@ -335,6 +338,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov SVM_current_vmcb(%rdi), %rax mov KVM_VMCB_pa(%rax), %rax + /* Clobbers EFLAGS.ZF */ + VM_CLEAR_CPU_BUFFERS + /* Enter guest mode */ 1: vmrun %rax 2: diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ba736cbb0587..57d79fd31df0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid, __entry->used_max_basic ? ", used max basic" : "") ); +#define kvm_deliver_mode \ + {0x0, "Fixed"}, \ + {0x1, "LowPrio"}, \ + {0x2, "SMI"}, \ + {0x3, "Res3"}, \ + {0x4, "NMI"}, \ + {0x5, "INIT"}, \ + {0x6, "SIPI"}, \ + {0x7, "ExtINT"} + +#ifdef CONFIG_KVM_IOAPIC +TRACE_EVENT(kvm_ioapic_set_irq, + TP_PROTO(__u64 e, int pin, bool coalesced), + TP_ARGS(e, pin, coalesced), + + TP_STRUCT__entry( + __field( __u64, e ) + __field( int, pin ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->e = e; + __entry->pin = pin; + __entry->coalesced = coalesced; + ), + + TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", + __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "", + __entry->coalesced ? " (coalesced)" : "") +); + +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, + TP_PROTO(__u64 e), + TP_ARGS(e), + + TP_STRUCT__entry( + __field( __u64, e ) + ), + + TP_fast_assign( + __entry->e = e; + ), + + TP_printk("dst %x vec %u (%s|%s|%s%s)", + (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "") +); +#endif + +TRACE_EVENT(kvm_msi_set_irq, + TP_PROTO(__u64 address, __u64 data), + TP_ARGS(address, data), + + TP_STRUCT__entry( + __field( __u64, address ) + __field( __u64, data ) + ), + + TP_fast_assign( + __entry->address = address; + __entry->data = data; + ), + + TP_printk("dst %llx vec %u (%s|%s|%s%s)", + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), + (u8)__entry->data, + __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), + (__entry->address & (1<<2)) ? "logical" : "physical", + (__entry->data & (1<<15)) ? "level" : "edge", + (__entry->address & (1<<3)) ? "|rh" : "") +); + #define AREG(x) { APIC_##x, "APIC_" #x } #define kvm_trace_symbol_apic \ @@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition, * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC. */ TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, - unsigned int gsi, unsigned int gvec, - u64 pi_desc_addr, bool set), - TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), + TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu, + unsigned int gsi, unsigned int gvec, bool set), + TP_ARGS(host_irq, vcpu, gsi, gvec, set), TP_STRUCT__entry( __field( unsigned int, host_irq ) - __field( unsigned int, vcpu_id ) + __field( int, vcpu_id ) __field( unsigned int, gsi ) __field( unsigned int, gvec ) - __field( u64, pi_desc_addr ) __field( bool, set ) ), TP_fast_assign( __entry->host_irq = host_irq; - __entry->vcpu_id = vcpu_id; + __entry->vcpu_id = vcpu ? 
vcpu->vcpu_id : -1; __entry->gsi = gsi; __entry->gvec = gvec; - __entry->pi_desc_addr = pi_desc_addr; __entry->set = set; ), - TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, " - "gvec: 0x%x, pi_desc_addr: 0x%llx", + TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x", __entry->set ? "enabled and being updated" : "disabled", __entry->host_irq, __entry->vcpu_id, __entry->gsi, - __entry->gvec, - __entry->pi_desc_addr) + __entry->gvec) ); /* diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cb6588238f46..5316c27f6099 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; -extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h new file mode 100644 index 000000000000..bc5ece76533a --- /dev/null +++ b/arch/x86/kvm/vmx/common.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_X86_VMX_COMMON_H +#define __KVM_X86_VMX_COMMON_H + +#include <linux/kvm_host.h> +#include <asm/posted_intr.h> + +#include "mmu.h" + +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +struct vcpu_vt { + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + + union vmx_exit_reason exit_reason; + + unsigned long exit_qualification; + u32 exit_intr_info; + + /* + * If true, guest state has been loaded into hardware, and host state + * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into + * hardware. + */ + bool guest_state_loaded; + bool emulation_required; + +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; +#endif +}; + +#ifdef CONFIG_KVM_INTEL_TDX + +static __always_inline bool is_td(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) +{ + return is_td(vcpu->kvm); +} + +#else + +static __always_inline bool is_td(struct kvm *kvm) { return false; } +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } + +#endif + +static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) +{ + /* For TDX the direct mask is the shared mask. */ + return !kvm_is_addr_direct(kvm, gpa); +} + +static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long exit_qualification) +{ + u64 error_code; + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) + ? 
PFERR_PRESENT_MASK : 0; + + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) + error_code |= PFERR_PRIVATE_ACCESS; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + /* + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. + * + * When the target is not the running vCPU, the following + * possibilities emerge: + * + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. + * + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. + * + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. + */ + + if (vcpu != kvm_get_running_vcpu()) + __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return; + } +#endif + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); +} + +/* + * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the + * interrupt if necessary. + */ +static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, + struct pi_desc *pi_desc, int vector) +{ + if (pi_test_and_set_pir(vector, pi_desc)) + return; + + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(pi_desc)) + return; + + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); +} + +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_VMX_COMMON_H */ diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 43ee9ed11291..dbab1c15b0cd 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -3,9 +3,848 @@ #include "x86_ops.h" #include "vmx.h" +#include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" +#include "tdx_arch.h" + +#ifdef CONFIG_KVM_INTEL_TDX +static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt)); + +static void vt_disable_virtualization_cpu(void) +{ + /* Note, TDX *and* VMX need to be disabled if TDX is enabled. 
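The PIR/ON handshake that the comment above walks through can be sketched in isolation. A minimal sketch, assuming hypothetical example_* types and generic kernel bitops (not code from this patch):

/*
 * Illustrative sketch, not part of this patch: the sender side of the
 * PIR/ON handshake, using generic atomic bitops on a hypothetical
 * example_pi_desc.  Set the vector in the PIR first, then test-and-set
 * ON; only the winner of the ON race needs to send the notification IPI.
 * The atomic RMWs provide the ordering the real code relies on.
 */
#include <linux/bitops.h>
#include <linux/types.h>

struct example_pi_desc {
	unsigned long pir[256 / BITS_PER_LONG];	/* one bit per vector */
	unsigned long control;			/* bit 0: ON (notification outstanding) */
};

#define EXAMPLE_PID_ON_BIT	0

/* Returns true if the caller must send the notification IPI. */
static bool example_post_interrupt(struct example_pi_desc *pid, u8 vector)
{
	if (test_and_set_bit(vector, pid->pir))
		return false;		/* vector was already pending */

	return !test_and_set_bit(EXAMPLE_PID_ON_BIT, &pid->control);
}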
*/ + if (enable_tdx) + tdx_disable_virtualization_cpu(); + vmx_disable_virtualization_cpu(); +} + +static __init int vt_hardware_setup(void) +{ + int ret; + + ret = vmx_hardware_setup(); + if (ret) + return ret; + + if (enable_tdx) + tdx_hardware_setup(); + + return 0; +} + +static int vt_vm_init(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_init(kvm); + + return vmx_vm_init(kvm); +} + +static void vt_vm_pre_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_mmu_release_hkid(kvm); +} + +static void vt_vm_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_destroy(kvm); + + vmx_vm_destroy(kvm); +} + +static int vt_vcpu_precreate(struct kvm *kvm) +{ + if (is_td(kvm)) + return 0; + + return vmx_vcpu_precreate(kvm); +} + +static int vt_vcpu_create(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_create(vcpu); + + return vmx_vcpu_create(vcpu); +} + +static void vt_vcpu_free(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_free(vcpu); + return; + } + + vmx_vcpu_free(vcpu); +} + +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_reset(vcpu, init_event); + return; + } + + vmx_vcpu_reset(vcpu, init_event); +} + +static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_load(vcpu, cpu); + return; + } + + vmx_vcpu_load(vcpu, cpu); +} + +static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + /* + * Basic TDX does not support feature PML. KVM does not enable PML in + * TD's VMCS, nor does it allocate or flush PML buffer for TDX. + */ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_update_cpu_dirty_logging(vcpu); +} + +static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_prepare_switch_to_guest(vcpu); + return; + } + + vmx_prepare_switch_to_guest(vcpu); +} + +static void vt_vcpu_put(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_put(vcpu); + return; + } + + vmx_vcpu_put(vcpu); +} + +static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_pre_run(vcpu); + + return vmx_vcpu_pre_run(vcpu); +} + +static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_run(vcpu, run_flags); + + return vmx_vcpu_run(vcpu, run_flags); +} + +static int vt_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath) +{ + if (is_td_vcpu(vcpu)) + return tdx_handle_exit(vcpu, fastpath); + + return vmx_handle_exit(vcpu, fastpath); +} + +static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_set_msr(vcpu, msr_info); + + return vmx_set_msr(vcpu, msr_info); +} + +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool vt_has_emulated_msr(struct kvm *kvm, u32 index) +{ + if (kvm && is_td(kvm)) + return tdx_has_emulated_msr(index); + + return vmx_has_emulated_msr(kvm, index); +} + +static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_get_msr(vcpu, msr_info); + + return vmx_get_msr(vcpu, msr_info); +} + +static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +{ + /* + * TDX doesn't allow VMM to configure interception of MSR accesses. + * TDX guest requests MSR accesses by calling TDVMCALL. 
The MSR + * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR + * if the userspace has set any. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_recalc_msr_intercepts(vcpu); +} + +static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (is_td_vcpu(vcpu)) + return tdx_complete_emulated_msr(vcpu, err); + + return vmx_complete_emulated_msr(vcpu, err); +} + +#ifdef CONFIG_KVM_SMM +static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_smi_allowed(vcpu, for_injection); +} + +static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_enter_smm(vcpu, smram); +} + +static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_leave_smm(vcpu, smram); +} + +static void vt_enable_smi_window(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return; + + /* RSM will cause a vmexit anyway. */ + vmx_enable_smi_window(vcpu); +} +#endif + +static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + /* + * For TDX, this can only be triggered for MMIO emulation. Let the + * guest retry after installing the SPTE with suppress #VE bit cleared, + * so that the guest will receive #VE when retry. The guest is expected + * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on + * #VE. + */ + if (is_td_vcpu(vcpu)) + return X86EMUL_RETRY_INSTR; + + return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len); +} + +static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + /* + * INIT and SIPI are always blocked for TDX, i.e., INIT handling and + * the OP vcpu_deliver_sipi_vector() won't be called. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_apic_init_signal_blocked(vcpu); +} + +static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + /* Only x2APIC mode is supported for TD. 
*/ + if (is_td_vcpu(vcpu)) + return; + + return vmx_set_virtual_apic_mode(vcpu); +} + +static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + if (is_td_vcpu(vcpu)) + return; + + return vmx_hwapic_isr_update(vcpu, max_isr); +} + +static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return -1; + + return vmx_sync_pir_to_irr(vcpu); +} + +static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + if (is_td_vcpu(apic->vcpu)) { + tdx_deliver_interrupt(apic, delivery_mode, trig_mode, + vector); + return; + } + + vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); +} + +static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_vcpu_after_set_cpuid(vcpu); +} + +static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_exception_bitmap(vcpu); +} + +static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_segment_base(vcpu, seg); +} + +static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) { + memset(var, 0, sizeof(*var)); + return; + } + + vmx_get_segment(vcpu, var, seg); +} + +static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_segment(vcpu, var, seg); +} + +static int vt_get_cpl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl(vcpu); +} + +static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl_no_cache(vcpu); +} + +static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + if (is_td_vcpu(vcpu)) { + *db = 0; + *l = 0; + return; + } + + vmx_get_cs_db_l_bits(vcpu, db, l); +} + +static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr0(vcpu, cr0); +} + +static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr0(vcpu, cr0); +} + +static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr4(vcpu, cr4); +} + +static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr4(vcpu, cr4); +} + +static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_set_efer(vcpu, efer); +} + +static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_idt(vcpu, dt); +} + +static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_idt(vcpu, dt); +} + +static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_gdt(vcpu, dt); +} + +static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_gdt(vcpu, dt); +} + +static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr7(vcpu, val); +} + +static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + /* + * MOV-DR exiting is always cleared for TD guest, even in debug mode. 
+ * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never + * reach here for TD vcpu. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_sync_dirty_debug_regs(vcpu); +} + +static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_cache_reg(vcpu, reg); +} + +static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_rflags(vcpu); +} + +static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_rflags(vcpu, rflags); +} + +static bool vt_get_if_flag(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_if_flag(vcpu); +} + +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_all(vcpu); + return; + } + + vmx_flush_tlb_all(vcpu); +} + +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_current(vcpu); + return; + } + + vmx_flush_tlb_current(vcpu); +} + +static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_gva(vcpu, addr); +} + +static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_guest(vcpu); +} + +static void vt_inject_nmi(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_inject_nmi(vcpu); + return; + } + + vmx_inject_nmi(vcpu); +} + +static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + /* + * The TDX module manages NMI windows and NMI reinjection, and hides NMI + * blocking, all KVM can do is throw an NMI over the wall. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_nmi_allowed(vcpu, for_injection); +} + +static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get NMI blocking status for TDX guest, assume NMIs are + * always unmasked. + */ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_nmi_mask(vcpu); +} + +static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_nmi_mask(vcpu, masked); +} + +static void vt_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + /* Refer to the comments in tdx_inject_nmi(). */ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_nmi_window(vcpu); +} + +static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int pgd_level) +{ + if (is_td_vcpu(vcpu)) { + tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level); + return; + } + + vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level); +} + +static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_interrupt_shadow(vcpu, mask); +} + +static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_interrupt_shadow(vcpu); +} + +static void vt_patch_hypercall(struct kvm_vcpu *vcpu, + unsigned char *hypercall) +{ + /* + * Because guest memory is protected, guest can't be patched. TD kernel + * is modified to use TDG.VP.VMCALL for hypercall. 
+ */ + if (is_td_vcpu(vcpu)) + return; + + vmx_patch_hypercall(vcpu, hypercall); +} + +static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_irq(vcpu, reinjected); +} + +static void vt_inject_exception(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_exception(vcpu); +} + +static void vt_cancel_injection(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_injection(vcpu); +} + +static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (is_td_vcpu(vcpu)) + return tdx_interrupt_allowed(vcpu); + + return vmx_interrupt_allowed(vcpu, for_injection); +} + +static void vt_enable_irq_window(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_irq_window(vcpu); +} + +static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = 0; + *error_code = 0; + + if (is_td_vcpu(vcpu)) + return; + + vmx_get_entry_info(vcpu, intr_info, error_code); +} + +static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + if (is_td_vcpu(vcpu)) { + tdx_get_exit_info(vcpu, reason, info1, info2, intr_info, + error_code); + return; + } + + vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); +} + +static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_cr8_intercept(vcpu, tpr, irr); +} + +static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_apic_access_page_addr(vcpu); +} + +static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm); + return; + } + + vmx_refresh_apicv_exec_ctrl(vcpu); +} + +static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + +static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_tss_addr(kvm, addr); +} + +static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_identity_map_addr(kvm, ident_addr); +} + +static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_offset(vcpu); +} + +static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_multiplier(vcpu); +} + +static void vt_write_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc offset can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_offset(vcpu); +} + +static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc multiplier can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_multiplier(vcpu); +} + +#ifdef CONFIG_X86_64 +static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) +{ + /* VMX-preemption timer isn't available for TDX. */ + if (is_td_vcpu(vcpu)) + return -EINVAL; + + return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired); +} + +static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + /* VMX-preemption timer can't be set. See vt_set_hv_timer(). 
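The wrappers above all follow the same shape. A minimal sketch of the two dispatch levels, a run-time is_td check plus a build-time vt_op()-style macro fallback, using hypothetical example_* names (not code from this patch):

/*
 * Illustrative sketch, not part of this patch: at run time a wrapper routes
 * each call to the TDX or VMX implementation per vCPU; at build time a
 * vt_op()-style macro falls back to the plain VMX symbol when TDX support
 * is compiled out, so the ops table needs no #ifdef per entry.
 */
#include <linux/types.h>

struct example_vcpu {
	bool is_td;
};

static int example_tdx_query(struct example_vcpu *v) { return 0; }
static int example_vmx_query(struct example_vcpu *v) { return 1; }

static int example_vt_query(struct example_vcpu *v)
{
	if (v->is_td)
		return example_tdx_query(v);

	return example_vmx_query(v);
}

#ifdef CONFIG_EXAMPLE_TDX
#define example_op(name)	example_vt_##name	/* wrapper handles both */
#else
#define example_op(name)	example_vmx_##name	/* VMX-only build */
#endif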
*/ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_hv_timer(vcpu); +} +#endif + +static void vt_setup_mce(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_setup_mce(vcpu); +} + +static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) +{ + if (!is_td(kvm)) + return -ENOTTY; + + return tdx_vm_ioctl(kvm, argp); +} + +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_ioctl(vcpu, argp); +} + +static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + if (is_td(kvm)) + return tdx_gmem_private_max_mapping_level(kvm, pfn); + + return 0; +} + +#define vt_op(name) vt_##name +#define vt_op_tdx_only(name) vt_##name +#else /* CONFIG_KVM_INTEL_TDX */ +#define vt_op(name) vmx_##name +#define vt_op_tdx_only(name) NULL +#endif /* CONFIG_KVM_INTEL_TDX */ #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -24,147 +863,210 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, .enable_virtualization_cpu = vmx_enable_virtualization_cpu, - .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .disable_virtualization_cpu = vt_op(disable_virtualization_cpu), .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, - .has_emulated_msr = vmx_has_emulated_msr, + .has_emulated_msr = vt_op(has_emulated_msr), .vm_size = sizeof(struct kvm_vmx), - .vm_init = vmx_vm_init, - .vm_destroy = vmx_vm_destroy, - .vcpu_precreate = vmx_vcpu_precreate, - .vcpu_create = vmx_vcpu_create, - .vcpu_free = vmx_vcpu_free, - .vcpu_reset = vmx_vcpu_reset, + .vm_init = vt_op(vm_init), + .vm_destroy = vt_op(vm_destroy), + .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy), - .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, + .vcpu_precreate = vt_op(vcpu_precreate), + .vcpu_create = vt_op(vcpu_create), + .vcpu_free = vt_op(vcpu_free), + .vcpu_reset = vt_op(vcpu_reset), - .update_exception_bitmap = vmx_update_exception_bitmap, + .prepare_switch_to_guest = vt_op(prepare_switch_to_guest), + .vcpu_load = vt_op(vcpu_load), + .vcpu_put = vt_op(vcpu_put), + + .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS, + + .update_exception_bitmap = vt_op(update_exception_bitmap), .get_feature_msr = vmx_get_feature_msr, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cpl_no_cache = vmx_get_cpl_no_cache, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr6 = vmx_set_dr6, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, - - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, - - .vcpu_pre_run = vmx_vcpu_pre_run, - .vcpu_run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, + .get_msr = vt_op(get_msr), + .set_msr = vt_op(set_msr), + + .get_segment_base = vt_op(get_segment_base), + .get_segment = 
vt_op(get_segment), + .set_segment = vt_op(set_segment), + .get_cpl = vt_op(get_cpl), + .get_cpl_no_cache = vt_op(get_cpl_no_cache), + .get_cs_db_l_bits = vt_op(get_cs_db_l_bits), + .is_valid_cr0 = vt_op(is_valid_cr0), + .set_cr0 = vt_op(set_cr0), + .is_valid_cr4 = vt_op(is_valid_cr4), + .set_cr4 = vt_op(set_cr4), + .set_efer = vt_op(set_efer), + .get_idt = vt_op(get_idt), + .set_idt = vt_op(set_idt), + .get_gdt = vt_op(get_gdt), + .set_gdt = vt_op(set_gdt), + .set_dr7 = vt_op(set_dr7), + .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs), + .cache_reg = vt_op(cache_reg), + .get_rflags = vt_op(get_rflags), + .set_rflags = vt_op(set_rflags), + .get_if_flag = vt_op(get_if_flag), + + .flush_tlb_all = vt_op(flush_tlb_all), + .flush_tlb_current = vt_op(flush_tlb_current), + .flush_tlb_gva = vt_op(flush_tlb_gva), + .flush_tlb_guest = vt_op(flush_tlb_guest), + + .vcpu_pre_run = vt_op(vcpu_pre_run), + .vcpu_run = vt_op(vcpu_run), + .handle_exit = vt_op(handle_exit), .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .inject_irq = vmx_inject_irq, - .inject_nmi = vmx_inject_nmi, - .inject_exception = vmx_inject_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = vmx_enable_nmi_window, - .enable_irq_window = vmx_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, + .set_interrupt_shadow = vt_op(set_interrupt_shadow), + .get_interrupt_shadow = vt_op(get_interrupt_shadow), + .patch_hypercall = vt_op(patch_hypercall), + .inject_irq = vt_op(inject_irq), + .inject_nmi = vt_op(inject_nmi), + .inject_exception = vt_op(inject_exception), + .cancel_injection = vt_op(cancel_injection), + .interrupt_allowed = vt_op(interrupt_allowed), + .nmi_allowed = vt_op(nmi_allowed), + .get_nmi_mask = vt_op(get_nmi_mask), + .set_nmi_mask = vt_op(set_nmi_mask), + .enable_nmi_window = vt_op(enable_nmi_window), + .enable_irq_window = vt_op(enable_irq_window), + .update_cr8_intercept = vt_op(update_cr8_intercept), .x2apic_icr_is_split = false, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_pre_state_restore = vmx_apicv_pre_state_restore, + .set_virtual_apic_mode = vt_op(set_virtual_apic_mode), + .set_apic_access_page_addr = vt_op(set_apic_access_page_addr), + .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl), + .load_eoi_exitmap = vt_op(load_eoi_exitmap), + .apicv_pre_state_restore = pi_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_isr_update = vmx_hwapic_isr_update, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_interrupt = vmx_deliver_interrupt, + .hwapic_isr_update = vt_op(hwapic_isr_update), + .sync_pir_to_irr = vt_op(sync_pir_to_irr), + .deliver_interrupt = vt_op(deliver_interrupt), .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, + .set_tss_addr = vt_op(set_tss_addr), + .set_identity_map_addr = vt_op(set_identity_map_addr), .get_mt_mask = vmx_get_mt_mask, - .get_exit_info = 
vmx_get_exit_info, - .get_entry_info = vmx_get_entry_info, + .get_exit_info = vt_op(get_exit_info), + .get_entry_info = vt_op(get_entry_info), - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid), .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .get_l2_tsc_offset = vmx_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_tsc_offset = vmx_write_tsc_offset, - .write_tsc_multiplier = vmx_write_tsc_multiplier, + .get_l2_tsc_offset = vt_op(get_l2_tsc_offset), + .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier), + .write_tsc_offset = vt_op(write_tsc_offset), + .write_tsc_multiplier = vt_op(write_tsc_multiplier), - .load_mmu_pgd = vmx_load_mmu_pgd, + .load_mmu_pgd = vt_op(load_mmu_pgd), .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .cpu_dirty_log_size = PML_LOG_NR_ENTRIES, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging), .nested_ops = &vmx_nested_ops, .pi_update_irte = vmx_pi_update_irte, - .pi_start_assignment = vmx_pi_start_assignment, + .pi_start_bypass = vmx_pi_start_bypass, #ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, + .set_hv_timer = vt_op(set_hv_timer), + .cancel_hv_timer = vt_op(cancel_hv_timer), #endif - .setup_mce = vmx_setup_mce, + .setup_mce = vt_op(setup_mce), #ifdef CONFIG_KVM_SMM - .smi_allowed = vmx_smi_allowed, - .enter_smm = vmx_enter_smm, - .leave_smm = vmx_leave_smm, - .enable_smi_window = vmx_enable_smi_window, + .smi_allowed = vt_op(smi_allowed), + .enter_smm = vt_op(enter_smm), + .leave_smm = vt_op(leave_smm), + .enable_smi_window = vt_op(enable_smi_window), #endif - .check_emulate_instruction = vmx_check_emulate_instruction, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .check_emulate_instruction = vt_op(check_emulate_instruction), + .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), .migrate_timers = vmx_migrate_timers, - .msr_filter_changed = vmx_msr_filter_changed, - .complete_emulated_msr = kvm_complete_insn_gp, + .recalc_msr_intercepts = vt_op(recalc_msr_intercepts), + .complete_emulated_msr = vt_op(complete_emulated_msr), .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, .get_untagged_addr = vmx_get_untagged_addr, + + .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), + .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), + + .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) }; struct kvm_x86_init_ops vt_init_ops __initdata = { - .hardware_setup = vmx_hardware_setup, + .hardware_setup = vt_op(hardware_setup), .handle_intel_pt_intr = NULL, .runtime_ops = &vt_x86_ops, .pmu_ops = &intel_pmu_ops, }; + +static void __exit vt_exit(void) +{ + kvm_exit(); + tdx_cleanup(); + vmx_exit(); +} +module_exit(vt_exit); + +static int __init vt_init(void) +{ + unsigned vcpu_size, vcpu_align; + int r; + + r = vmx_init(); + if (r) + return r; + + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + + /* + * TDX and VMX have different vCPU structures. Calculate the + * maximum size/align so that kvm_init() can use the larger + * values to create the kmem_vcpu_cache. 
+ */ + vcpu_size = sizeof(struct vcpu_vmx); + vcpu_align = __alignof__(struct vcpu_vmx); + if (enable_tdx) { + vcpu_size = max_t(unsigned, vcpu_size, + sizeof(struct vcpu_tdx)); + vcpu_align = max_t(unsigned, vcpu_align, + __alignof__(struct vcpu_tdx)); + kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM); + } + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: + vmx_exit(); + return r; +} +module_init(vt_init); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d268224227f0..b8ea1969113d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -276,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, { struct vmcs_host_state *dest, *src; - if (unlikely(!vmx->guest_state_loaded)) + if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; @@ -302,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); @@ -426,7 +426,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes. */ - WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); /* * PML Full and EPT Violation VM-Exits both use bit 12 to report @@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_APERF, MSR_TYPE_R); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_MPERF, MSR_TYPE_R); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -825,12 +831,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; + /* + * Exceeding the limit results in architecturally _undefined_ behavior, + * i.e. KVM is allowed to do literally anything in response to a bad + * limit. Immediately generate a consistency check so that code that + * consumes the count doesn't need to worry about extreme edge cases. 
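For reference, the limit consumed here follows the architectural formula from IA32_VMX_MISC. A minimal sketch, assuming the SDM-defined encoding (bits 27:25 give N, recommended maximum is 512 * (N + 1)); this is not code from the patch itself:

/*
 * Illustrative sketch, not part of this patch: the architectural limit the
 * consistency check enforces.  IA32_VMX_MISC bits 27:25 hold N, and the
 * recommended maximum number of MSR-load/store entries is 512 * (N + 1)
 * (512 is VMX_MISC_MSR_LIST_MULTIPLIER in KVM).
 */
#include <linux/types.h>

static inline u32 example_max_atomic_switch_msrs(u64 vmx_misc)
{
	u32 n = (vmx_misc >> 25) & 0x7;

	return 512 * (n + 1);
}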
+ */ + if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) + return -EINVAL; + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; @@ -941,15 +965,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } -static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; -} - /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. @@ -966,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), @@ -1054,7 +1069,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) @@ -2654,10 +2669,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & + vmx_get_supported_debugctl(vcpu, false)); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); + vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) @@ -3147,7 +3163,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && - CC(!kvm_dr7_valid(vmcs12->guest_dr7))) + (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || + CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && @@ -3521,7 +3538,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) @@ -4521,12 +4538,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } @@ -4599,6 +4616,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & 
VM_ENTRY_IA32E_MODE); + /* + * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. + * Writes to DEBUGCTL that aren't intercepted by L1 are immediately + * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into + * vmcs02 doesn't strictly track vmcs12. + */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) vmcs12->guest_dr7 = vcpu->arch.dr7; @@ -4623,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; - if (to_vmx(vcpu)->exit_reason.enclave_mode) + if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; @@ -4789,13 +4812,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmx_guest_debugctl_write(vcpu, 0); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -4844,6 +4867,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); } + /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ + vmx_reload_guest_debugctl(vcpu); + /* * Note that calling vmx_set_{efer,cr0,cr4} is important as they * handle a variety of side effects to KVM's software model. @@ -5021,16 +5047,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - /* - * If IBRS is advertised to the vCPU, KVM must flush the indirect - * branch predictors when transitioning from L2 to L1, as L1 expects - * hardware (KVM in this case) to provide separate predictor modes. - * Bare metal isolates VMX root (host) from VMX non-root (guest), but - * doesn't isolate different VMCSs, i.e. in this case, doesn't provide - * separate modes for L2 vs L1. - */ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL)) - indirect_branch_prediction_barrier(); + kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -6128,7 +6145,7 @@ fail: * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. 
*/ - nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -6573,7 +6590,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 231a9633359c..0b173602821b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -20,6 +20,7 @@ #include "lapic.h" #include "nested.h" #include "pmu.h" +#include "tdx.h" /* * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX @@ -35,6 +36,24 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) +static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc; +} + +static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc.records; +} + +#pragma GCC poison to_vmx + static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; @@ -130,6 +149,22 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } +static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return cpuid_model_is_consistent(vcpu); +} + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return !!vcpu_to_lbr_records(vcpu)->nr; +} + static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) { struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); @@ -195,6 +230,9 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) { struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + if (lbr_desc->event) { perf_event_release_kernel(lbr_desc->event); lbr_desc->event = NULL; @@ -236,6 +274,9 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) PERF_SAMPLE_BRANCH_USER, }; + if (WARN_ON_ONCE(!lbr_desc)) + return 0; + if (unlikely(lbr_desc->event)) { __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); return 0; @@ -467,6 +508,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) u64 perf_capabilities; u64 counter_rsvd; + if (!lbr_desc) + return; + memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); /* @@ -543,7 +587,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (cpuid_model_is_consistent(vcpu) && + if (intel_pmu_lbr_is_compatible(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else @@ -571,6 +615,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; @@ -606,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) */ static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) { - u64 data = 
vmcs_read64(GUEST_IA32_DEBUGCTL); + u64 data = vmx_guest_debugctl_read(); if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { data &= ~DEBUGCTLMSR_LBR; - vmcs_write64(GUEST_IA32_DEBUGCTL, data); + vmx_guest_debugctl_write(vcpu, data); } } @@ -678,9 +725,12 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (WARN_ON_ONCE(!lbr_desc)) + return; + if (!lbr_desc->event) { vmx_disable_lbr_msrs_passthrough(vcpu); - if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) + if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR) goto warn; if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) goto warn; @@ -702,7 +752,7 @@ warn: static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) { - if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) + if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)) intel_pmu_release_guest_lbr_event(vcpu); } diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h new file mode 100644 index 000000000000..5620d0882cdc --- /dev/null +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_PMU_INTEL_H +#define __KVM_X86_VMX_PMU_INTEL_H + +#include <linux/kvm_host.h> + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + +extern struct x86_pmu_lbr vmx_lbr_caps; + +#endif /* __KVM_X86_VMX_PMU_INTEL_H */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index d70e5b90087d..4a6d9a17da23 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -2,6 +2,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> +#include <linux/kvm_irqfd.h> #include <asm/irq_remapping.h> #include <asm/cpu.h> @@ -11,6 +12,7 @@ #include "posted_intr.h" #include "trace.h" #include "vmx.h" +#include "tdx.h" /* * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() @@ -33,9 +35,9 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); #define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { - return &(to_vmx(vcpu)->pi_desc); + return &(to_vt(vcpu)->pi_desc); } static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) @@ -55,7 +57,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -71,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) /* * If the vCPU wasn't on the wakeup list and wasn't migrated, then the * full update can be skipped as neither the vector nor the destination - * needs to be changed. + * needs to be changed. Clear SN even if there is no assigned device, + * again for simplicity. 
*/ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { - /* - * Clear SN if it was set due to being preempted. Again, do - * this even if there is no assigned device for simplicity. - */ if (pi_test_and_clear_sn(pi_desc)) goto after_clear_sn; return; @@ -102,7 +101,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ raw_spin_lock(spinlock); spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); - list_del(&vmx->pi_wakeup_list); + list_del(&vt->pi_wakeup_list); spin_release(&spinlock->dep_map, _RET_IP_); raw_spin_unlock(spinlock); } @@ -147,9 +146,13 @@ after_clear_sn: static bool vmx_can_use_vtd_pi(struct kvm *kvm) { - return irqchip_in_kernel(kvm) && enable_apicv && - kvm_arch_has_assigned_device(kvm) && - irq_remapping_cap(IRQ_POSTING_CAP); + /* + * Note, reading the number of possible bypass IRQs can race with a + * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures + * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK. + */ + return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && + READ_ONCE(kvm->arch.nr_possible_bypass_irqs); } /* @@ -159,7 +162,7 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; lockdep_assert_irqs_disabled(); @@ -178,7 +181,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), PI_LOCK_SCHED_OUT); - list_add_tail(&vmx->pi_wakeup_list, + list_add_tail(&vt->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -213,7 +216,8 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) * notification vector is switched to the one that calls * back to the pi_wakeup_handler() function. */ - return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); + return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) || + vmx_can_use_vtd_pi(vcpu->kvm); } void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -223,15 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) - pi_enable_wakeup_handler(vcpu); - /* - * Set SN when the vCPU is preempted. Note, the vCPU can both be seen - * as blocking and preempted, e.g. if it's preempted between setting - * its wait state and manually scheduling out. + * If the vCPU is blocking with IRQs enabled and ISN'T being preempted, + * enable the wakeup handler so that notification IRQ wakes the vCPU as + * expected. There is no need to enable the wakeup handler if the vCPU + * is preempted between setting its wait state and manually scheduling + * out, as the task is still runnable, i.e. doesn't need a wake event + * from KVM to be scheduled in. + * + * If the wakeup handler isn't being enabled, Suppress Notifications as + * the cost of propagating PIR.IRR to PID.ON is negligible compared to + * the cost of a spurious IRQ, and vCPU put/load is a slow path. 
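A minimal sketch of the put-time decision described above, reduced to its inputs; hypothetical example_* names, VMX case only, not code from this patch:

/*
 * Illustrative sketch, not part of this patch: a blocking, non-preempted
 * vCPU with IRQs enabled re-arms the wakeup vector so a posted interrupt
 * wakes it; otherwise it stays runnable and Suppress Notification avoids
 * the cost of a notification IPI.
 */
#include <linux/types.h>

enum example_pi_put_action {
	EXAMPLE_ARM_WAKEUP_VECTOR,	/* notification IRQ must wake the vCPU */
	EXAMPLE_SET_SUPPRESS_NOTIFY,	/* vCPU stays runnable; skip the IPI cost */
};

static enum example_pi_put_action
example_pi_put_action(bool preempted, bool blocking, bool irqs_blocked)
{
	if (!preempted && blocking && !irqs_blocked)
		return EXAMPLE_ARM_WAKEUP_VECTOR;

	return EXAMPLE_SET_SUPPRESS_NOTIFY;
}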
*/ - if (vcpu->preempted) + if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && + ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || + (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) + pi_enable_wakeup_handler(vcpu); + else pi_set_sn(pi_desc); } @@ -243,13 +255,13 @@ void pi_wakeup_handler(void) int cpu = smp_processor_id(); struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); - struct vcpu_vmx *vmx; + struct vcpu_vt *vt; raw_spin_lock(spinlock); - list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { + list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { - if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_wake_up(&vmx->vcpu); + if (pi_test_on(&vt->pi_desc)) + kvm_vcpu_wake_up(vt_to_vcpu(vt)); } raw_spin_unlock(spinlock); } @@ -260,6 +272,14 @@ void __init pi_init_cpu(int cpu) raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi = vcpu_to_pi_desc(vcpu); + + pi_clear_on(pi); + memset(pi->pir, 0, sizeof(pi->pir)); +} + bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -270,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) /* - * Bail out of the block loop if the VM has an assigned - * device, but the blocking vCPU didn't reconfigure the - * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in vmx_vcpu_pi_put(). + * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as + * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup + * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put(). */ -void vmx_pi_start_assignment(struct kvm *kvm) +void vmx_pi_start_bypass(struct kvm *kvm) { - if (!irq_remapping_cap(IRQ_POSTING_CAP)) + if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm))) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); } -/* - * vmx_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector) { - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - bool enable_remapped_mode = true; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = 0; - - if (!vmx_can_use_vtd_pi(kvm)) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; + if (vcpu) { + struct intel_iommu_pi_data pi_data = { + .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), + .vector = vector, + }; + + return irq_set_vcpu_affinity(host_irq, &pi_data); + } else { + return irq_set_vcpu_affinity(host_irq, NULL); } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of 
interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) - continue; - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (!set) - continue; - - enable_remapped_mode = false; - - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - if (enable_remapped_mode) - ret = irq_set_vcpu_affinity(host_irq, NULL); - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; } diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index ad9116a99bcc..a4af39948cf0 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -3,22 +3,27 @@ #define __KVM_X86_VMX_POSTED_INTR_H #include <linux/bitmap.h> +#include <linux/find.h> +#include <linux/kvm_host.h> + #include <asm/posted_intr.h> void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); -void vmx_pi_start_assignment(struct kvm *kvm); +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_vcpu *vcpu, u32 vector); +void vmx_pi_start_bypass(struct kvm *kvm); static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { int vec; - vec = find_last_bit((unsigned long *)pi_desc->pir, 256); + vec = find_last_bit(pi_desc->pir, 256); return vec < 256 ? 
vec : -1; } diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index 6a9bfdfbb6e5..2f20fb170def 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,10 +2,12 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME_SHIFT 0 -#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 +#define VMX_RUN_VMRESUME_SHIFT 0 +#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 +#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2 -#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) -#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) +#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) +#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..66744f5768c8 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,3643 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cleanup.h> +#include <linux/cpu.h> +#include <asm/cpufeature.h> +#include <asm/fpu/xcr.h> +#include <linux/misc_cgroup.h> +#include <linux/mmu_context.h> +#include <asm/tdx.h> +#include "capabilities.h" +#include "mmu.h" +#include "x86_ops.h" +#include "lapic.h" +#include "tdx.h" +#include "vmx.h" +#include "mmu/spte.h" +#include "common.h" +#include "posted_intr.h" +#include "irq.h" +#include <trace/events/kvm.h> +#include "trace.h" + +#pragma GCC poison to_vmx + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define pr_tdx_error(__fn, __err) \ + pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) + +#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ + pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) + +#define pr_tdx_error_1(__fn, __err, __rcx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) + +#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) + +#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) + +bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) +#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) + +static enum cpuhp_state tdx_cpuhp_state; + +static const struct tdx_sys_info *tdx_sysinfo; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); +} + +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); +} + +#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) + +static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_tdx, kvm); +} + +static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_tdx, vcpu); +} + +static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = KVM_SUPPORTED_TD_ATTRS; + + if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) + return 0; + + val &= td_conf->attributes_fixed0; + + return val; +} + +static u64 
tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; + + if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) + return 0; + + val &= td_conf->xfam_fixed0; + + return val; +} + +static int tdx_get_guest_phys_addr_bits(const u32 eax) +{ + return (eax & GENMASK(23, 16)) >> 16; +} + +static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) +{ + return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; +} + +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) + +static bool has_tsx(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ebx & TDX_FEATURE_TSX); +} + +static void clear_tsx(struct kvm_cpuid_entry2 *entry) +{ + entry->ebx &= ~TDX_FEATURE_TSX; +} + +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); +} + +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) +{ + entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); +} + +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) +{ + if (has_tsx(entry)) + clear_tsx(entry); + + if (has_waitpkg(entry)) + clear_waitpkg(entry); +} + +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) +{ + return has_tsx(entry) || has_waitpkg(entry); +} + +#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) + +static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + + entry->function = (u32)td_conf->cpuid_config_leaves[idx]; + entry->index = td_conf->cpuid_config_leaves[idx] >> 32; + entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; + entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; + entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; + entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; + + if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) + entry->index = 0; + + /* + * The TDX module doesn't allow configuring the guest phys addr bits + * (EAX[23:16]). However, KVM uses it as an interface to the userspace + * to configure the GPAW. Report these bits as configurable. + */ + if (entry->function == 0x80000008) + entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); + + tdx_clear_unsupported_cpuid(entry); +} + +#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1) + +static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, + struct kvm_tdx_capabilities *caps) +{ + int i; + + caps->supported_attrs = tdx_get_supported_attrs(td_conf); + if (!caps->supported_attrs) + return -EIO; + + caps->supported_xfam = tdx_get_supported_xfam(td_conf); + if (!caps->supported_xfam) + return -EIO; + + caps->cpuid.nent = td_conf->num_cpuid_config; + + caps->user_tdvmcallinfo_1_r11 = + TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT; + + for (i = 0; i < td_conf->num_cpuid_config; i++) + td_init_cpuid_entry2(&caps->cpuid.entries[i], i); + + return 0; +} + +/* + * Some SEAMCALLs acquire the TDX module globally, and can fail with + * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. + */ +static DEFINE_MUTEX(tdx_lock); + +static atomic_t nr_configured_hkid; + +static bool tdx_operand_busy(u64 err) +{ + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; +} + + +/* + * A per-CPU list of TD vCPUs associated with a given CPU. + * Protected by interrupt mask. 
Only manipulated by the CPU owning this per-CPU + * list. + * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of + * the old CPU during the IPI callback running on the old CPU, and then added + * to the per-CPU list of the new CPU. + * - When a TD is tearing down, all vCPUs are disassociated from their current + * running CPUs and removed from the per-CPU list during the IPI callback + * running on those CPUs. + * - When a CPU is brought down, traverse the per-CPU list to disassociate all + * associated TD vCPUs and remove them from the per-CPU list. + */ +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); + +static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r10; +} + +static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r11; +} + +static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, + long val) +{ + to_tdx(vcpu)->vp_enter_args.r10 = val; +} + +static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, + unsigned long val) +{ + to_tdx(vcpu)->vp_enter_args.r11 = val; +} + +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) +{ + tdx_guest_keyid_free(kvm_tdx->hkid); + kvm_tdx->hkid = -1; + atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; +} + +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) +{ + return kvm_tdx->hkid > 0; +} + +static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) +{ + lockdep_assert_irqs_disabled(); + + list_del(&to_tdx(vcpu)->cpu_list); + + /* + * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, + * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU + * to its list before it's deleted from this CPU's list. + */ + smp_wmb(); + + vcpu->cpu = -1; +} + +static void tdx_clear_page(struct page *page) +{ + const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); + void *dest = page_to_virt(page); + unsigned long i; + + /* + * The page could have been poisoned. MOVDIR64B also clears + * the poison bit so the kernel can safely use the page again. + */ + for (i = 0; i < PAGE_SIZE; i += 64) + movdir64b(dest + i, zero_page); + /* + * MOVDIR64B store uses WC buffer. Prevent following memory reads + * from seeing potentially poisoned cache. + */ + __mb(); +} + +static void tdx_no_vcpus_enter_start(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +static void tdx_no_vcpus_enter_stop(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); +} + +/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ +static int __tdx_reclaim_page(struct page *page) +{ + u64 err, rcx, rdx, r8; + + err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); + + /* + * No need to check for TDX_OPERAND_BUSY; all TD pages are freed + * before the HKID is released and control pages have also been + * released at this point, so there is no possibility of contention. 
+ */ + if (WARN_ON_ONCE(err)) { + pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + return -EIO; + } + return 0; +} + +static int tdx_reclaim_page(struct page *page) +{ + int r; + + r = __tdx_reclaim_page(page); + if (!r) + tdx_clear_page(page); + return r; +} + + +/* + * Reclaim the TD control page(s) which are crypto-protected by TDX guest's + * private KeyID. Assume the cache associated with the TDX private KeyID has + * been flushed. + */ +static void tdx_reclaim_control_page(struct page *ctrl_page) +{ + /* + * Leak the page if the kernel failed to reclaim the page. + * The kernel cannot use it safely anymore. + */ + if (tdx_reclaim_page(ctrl_page)) + return; + + __free_page(ctrl_page); +} + +struct tdx_flush_vp_arg { + struct kvm_vcpu *vcpu; + u64 err; +}; + +static void tdx_flush_vp(void *_arg) +{ + struct tdx_flush_vp_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + u64 err; + + arg->err = 0; + lockdep_assert_irqs_disabled(); + + /* Task migration can race with CPU offlining. */ + if (unlikely(vcpu->cpu != raw_smp_processor_id())) + return; + + /* + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The + * list tracking still needs to be updated so that it's correct if/when + * the vCPU does get initialized. + */ + if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { + /* + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This + * vp flush function is called when destructing vCPU/TD or vCPU + * migration. No other thread uses TDVPR in those cases. + */ + err = tdh_vp_flush(&to_tdx(vcpu)->vp); + if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { + /* + * This function is called in IPI context. Do not use + * printk to avoid console semaphore. + * The caller prints out the error message, instead. + */ + if (err) + arg->err = err; + } + } + + tdx_disassociate_vp(vcpu); +} + +static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) +{ + struct tdx_flush_vp_arg arg = { + .vcpu = vcpu, + }; + int cpu = vcpu->cpu; + + if (unlikely(cpu == -1)) + return; + + smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); + if (KVM_BUG_ON(arg.err, vcpu->kvm)) + pr_tdx_error(TDH_VP_FLUSH, arg.err); +} + +void tdx_disable_virtualization_cpu(void) +{ + int cpu = raw_smp_processor_id(); + struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); + struct tdx_flush_vp_arg arg; + struct vcpu_tdx *tdx, *tmp; + unsigned long flags; + + local_irq_save(flags); + /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ + list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { + arg.vcpu = &tdx->vcpu; + tdx_flush_vp(&arg); + } + local_irq_restore(flags); +} + +#define TDX_SEAMCALL_RETRIES 10000 + +static void smp_func_do_phymem_cache_wb(void *unused) +{ + u64 err = 0; + bool resume; + int i; + + /* + * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private + * KeyID on the package or core. The TDX module may not finish the + * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The + * kernel should retry it until it returns success w/o rescheduling. 
+ */ + for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { + resume = !!err; + err = tdh_phymem_cache_wb(resume); + switch (err) { + case TDX_INTERRUPTED_RESUMABLE: + continue; + case TDX_NO_HKID_READY_TO_WBCACHE: + err = TDX_SUCCESS; /* Already done by other thread */ + fallthrough; + default: + goto out; + } + } + +out: + if (WARN_ON_ONCE(err)) + pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); +} + +void tdx_mmu_release_hkid(struct kvm *kvm) +{ + bool packages_allocated, targets_allocated; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages, targets; + struct kvm_vcpu *vcpu; + unsigned long j; + int i; + u64 err; + + if (!is_hkid_assigned(kvm_tdx)) + return; + + packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); + targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); + cpus_read_lock(); + + kvm_for_each_vcpu(j, vcpu, kvm) + tdx_flush_vp_on_cpu(vcpu); + + /* + * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock + * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. + * Multiple TDX guests can be destroyed simultaneously. Take the + * mutex to prevent it from getting error. + */ + mutex_lock(&tdx_lock); + + /* + * Releasing HKID is in vm_destroy(). + * After the above flushing vps, there should be no more vCPU + * associations, as all vCPU fds have been released at this stage. + */ + err = tdh_mng_vpflushdone(&kvm_tdx->td); + if (err == TDX_FLUSHVP_NOT_DONE) + goto out; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + goto out; + } + + for_each_online_cpu(i) { + if (packages_allocated && + cpumask_test_and_set_cpu(topology_physical_package_id(i), + packages)) + continue; + if (targets_allocated) + cpumask_set_cpu(i, targets); + } + if (targets_allocated) + on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); + else + on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); + /* + * In the case of error in smp_func_do_phymem_cache_wb(), the following + * tdh_mng_key_freeid() will fail. + */ + err = tdh_mng_key_freeid(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_KEY_FREEID, err); + pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + } else { + tdx_hkid_free(kvm_tdx); + } + +out: + mutex_unlock(&tdx_lock); + cpus_read_unlock(); + free_cpumask_var(targets); + free_cpumask_var(packages); +} + +static void tdx_reclaim_td_control_pages(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + int i; + + /* + * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong + * heavily with TDX module. Give up freeing TD pages. As the function + * already warned, don't warn it again. + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (kvm_tdx->td.tdcs_pages) { + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (!kvm_tdx->td.tdcs_pages[i]) + continue; + + tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); + } + kfree(kvm_tdx->td.tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + } + + if (!kvm_tdx->td.tdr_page) + return; + + if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) + return; + + /* + * Use a SEAMCALL to ask the TDX module to flush the cache based on the + * KeyID. TDX module may access TDR while operating on TD (Especially + * when it is reclaiming TDCS). 
+ */ + err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return; + } + tdx_clear_page(kvm_tdx->td.tdr_page); + + __free_page(kvm_tdx->td.tdr_page); + kvm_tdx->td.tdr_page = NULL; +} + +void tdx_vm_destroy(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + tdx_reclaim_td_control_pages(kvm); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; +} + +static int tdx_do_tdh_mng_key_config(void *param) +{ + struct kvm_tdx *kvm_tdx = param; + u64 err; + + /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ + err = tdh_mng_key_config(&kvm_tdx->td); + + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { + pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + return -EIO; + } + + return 0; +} + +int tdx_vm_init(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + kvm->arch.has_protected_state = true; + kvm->arch.has_private_mem = true; + kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + /* + * Because guest TD is protected, VMM can't parse the instruction in TD. + * Instead, guest uses MMIO hypercall. For unmodified device driver, + * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO + * instruction into MMIO hypercall. + * + * SPTE value for MMIO needs to be setup so that #VE is injected into + * TD instead of triggering EPT MISCONFIG. + * - RWX=0 so that EPT violation is triggered. + * - suppress #VE bit is cleared to inject #VE. + */ + kvm_mmu_set_mmio_spte_value(kvm, 0); + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports + * such limit via the MAX_VCPU_PER_TD global metadata. In + * practice, it reflects the number of logical CPUs that ALL + * platforms that the TDX module supports can possibly have. + * + * Limit TDX guest's maximum vCPUs to the number of logical CPUs + * the platform has. Simply forwarding the MAX_VCPU_PER_TD to + * userspace would result in an unpredictable ABI. + */ + kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; + + return 0; +} + +int tdx_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (kvm_tdx->state != TD_STATE_INITIALIZED) + return -EIO; + + /* + * TDX module mandates APICv, which requires an in-kernel local APIC. + * Disallow an in-kernel I/O APIC, because level-triggered interrupts + * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. + */ + if (!irqchip_split(vcpu->kvm)) + return -EINVAL; + + fpstate_set_confidential(&vcpu->arch.guest_fpu); + vcpu->arch.apic->guest_apic_protected = true; + INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); + + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; + + vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; + vcpu->arch.cr0_guest_owned_bits = -1ul; + vcpu->arch.cr4_guest_owned_bits = -1ul; + + /* KVM can't change TSC offset/multiplier as TDX module manages them. 
*/ + vcpu->arch.guest_tsc_protected = true; + vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; + vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + + vcpu->arch.guest_state_protected = + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); + + if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) + vcpu->arch.xfd_no_write_intercept = true; + + tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&tdx->vt.pi_desc); + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; + + return 0; +} + +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + vmx_vcpu_pi_load(vcpu, cpu); + if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) + return; + + tdx_flush_vp_on_cpu(vcpu); + + KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); + local_irq_disable(); + /* + * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure + * vcpu->cpu is read before tdx->cpu_list. + */ + smp_rmb(); + + list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); + local_irq_enable(); +} + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get the interrupt status of TDX guest and it assumes + * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, + * which passes the interrupt blocked flag. + */ + return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + !to_tdx(vcpu)->vp_enter_args.r12; +} + +static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + u64 vcpu_state_details; + + if (pi_has_pending_interrupt(vcpu)) + return true; + + /* + * Only check RVI pending for HALTED case with IRQ enabled. + * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the + * interrupt was pending before TD exit, then it _must_ be blocked, + * otherwise the interrupt would have been serviced at the instruction + * boundary. + */ + if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + to_tdx(vcpu)->vp_enter_args.r12) + return false; + + vcpu_state_details = + td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); + + return tdx_vcpu_state_details_intr_pending(vcpu_state_details); +} + +/* + * Compared to vmx_prepare_switch_to_guest(), there is not much to do + * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
+ */ +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->guest_state_loaded = true; +} + +struct tdx_uret_msr { + u32 msr; + unsigned int slot; + u64 defval; +}; + +static struct tdx_uret_msr tdx_uret_msrs[] = { + {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, + {.msr = MSR_STAR,}, + {.msr = MSR_LSTAR,}, + {.msr = MSR_TSC_AUX,}, +}; + +static void tdx_user_return_msr_update_cache(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) + kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval); +} + +static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (!vt->guest_state_loaded) + return; + + ++vcpu->stat.host_state_reload; + wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); + + if (tdx->guest_entered) { + tdx_user_return_msr_update_cache(); + tdx->guest_entered = false; + } + + vt->guest_state_loaded = false; +} + +void tdx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + tdx_prepare_switch_to_host(vcpu); +} + +void tdx_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + int i; + + /* + * It is not possible to reclaim pages while hkid is assigned. It might + * be assigned if: + * 1. the TD VM is being destroyed but freeing hkid failed, in which + * case the pages are leaked + * 2. TD VCPU creation failed and this on the error path, in which case + * there is nothing to do anyway + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (tdx->vp.tdcx_pages) { + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + } + if (tdx->vp.tdvpr_page) { + tdx_reclaim_control_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + } + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; +} + +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || + to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) + return -EINVAL; + + return 1; +} + +static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case EXIT_REASON_CPUID: + case EXIT_REASON_HLT: + case EXIT_REASON_IO_INSTRUCTION: + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return tdvmcall_leaf(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return EXIT_REASON_EPT_MISCONFIG; + default: + break; + } + + return EXIT_REASON_TDCALL; +} + +static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u32 exit_reason; + + switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { + case TDX_SUCCESS: + case TDX_NON_RECOVERABLE_VCPU: + case TDX_NON_RECOVERABLE_TD: + case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: + case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: + break; + default: + return -1u; + } + + exit_reason = tdx->vp_enter_ret; + + switch (exit_reason) { + case EXIT_REASON_TDCALL: + if (tdvmcall_exit_type(vcpu)) + return EXIT_REASON_VMCALL; + + return tdcall_to_vmx_exit_reason(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + /* + * Defer KVM_BUG_ON() until tdx_handle_exit() because this is 
in + * non-instrumentable code with interrupts disabled. + */ + return -1u; + default: + break; + } + + return exit_reason; +} + +static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); + + guest_state_enter_irqoff(); + + tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args); + + vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu); + + vt->exit_qualification = tdx->vp_enter_args.rcx; + tdx->ext_exit_qualification = tdx->vp_enter_args.rdx; + tdx->exit_gpa = tdx->vp_enter_args.r8; + vt->exit_intr_info = tdx->vp_enter_args.r9; + + vmx_handle_nmi(vcpu); + + guest_state_exit_irqoff(); +} + +static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu) +{ + return vmx_get_exit_reason(vcpu).failed_vmentry && + vmx_get_exit_reason(vcpu).full != -1u; +} + +static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret; + + /* + * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation + * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. + * + * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both + * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target + * vCPUs leaving fastpath so that interrupt can be enabled to ensure the + * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of + * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the + * requester may be blocked endlessly. + */ + if (unlikely(tdx_operand_busy(vp_enter_ret))) + return EXIT_FASTPATH_EXIT_HANDLED; + + return EXIT_FASTPATH_NONE; +} + +#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \ + BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \ + BIT_ULL(VCPU_REGS_RAX) | \ + BIT_ULL(VCPU_REGS_RBX) | \ + BIT_ULL(VCPU_REGS_RCX) | \ + BIT_ULL(VCPU_REGS_RDX) | \ + BIT_ULL(VCPU_REGS_RBP) | \ + BIT_ULL(VCPU_REGS_RSI) | \ + BIT_ULL(VCPU_REGS_RDI) | \ + BIT_ULL(VCPU_REGS_R8) | \ + BIT_ULL(VCPU_REGS_R9) | \ + BIT_ULL(VCPU_REGS_R10) | \ + BIT_ULL(VCPU_REGS_R11) | \ + BIT_ULL(VCPU_REGS_R12) | \ + BIT_ULL(VCPU_REGS_R13) | \ + BIT_ULL(VCPU_REGS_R14) | \ + BIT_ULL(VCPU_REGS_R15)) + +static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + + /* + * All TDX hosts support PKRU; but even if they didn't, + * vcpu->arch.host_pkru would be 0 and the wrpkru would be + * skipped. + */ + if (vcpu->arch.host_pkru != 0) + wrpkru(vcpu->arch.host_pkru); + + if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); + + /* + * Likewise, even if a TDX hosts didn't support XSS both arms of + * the comparison would be 0 and the wrmsrl would be skipped. + */ + if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) + wrmsrl(MSR_IA32_XSS, kvm_host.xss); +} + +#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \ + DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ + DEBUGCTLMSR_FREEZE_IN_SMM) + +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); + + /* + * WARN if KVM wants to force an immediate exit, as the TDX module does + * not guarantee entry into the guest, i.e. it's possible for KVM to + * _think_ it completed entry to the guest and forced an immediate exit + * without actually having done so. Luckily, KVM never needs to force + * an immediate exit for TDX (KVM can't do direct event injection, so + * just WARN and continue on. 
+ */ + WARN_ON_ONCE(run_flags); + + /* + * Wait until retry of SEPT-zap-related SEAMCALL completes before + * allowing vCPU entry to avoid contention with tdh_vp_enter() and + * TDCALLs. + */ + if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) + return EXIT_FASTPATH_EXIT_HANDLED; + + trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT); + + if (pi_test_on(&vt->pi_desc)) { + apic->send_IPI_self(POSTED_INTR_VECTOR); + + if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & + APIC_VECTOR_MASK, &vt->pi_desc)) + kvm_wait_lapic_expire(vcpu); + } + + tdx_vcpu_enter_exit(vcpu); + + if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED) + update_debugctlmsr(vcpu->arch.host_debugctl); + + tdx_load_host_xsave_state(vcpu); + tdx->guest_entered = true; + + vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; + + if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) + return EXIT_FASTPATH_NONE; + + if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) + return EXIT_FASTPATH_NONE; + + if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) + kvm_machine_check(); + + trace_kvm_exit(vcpu, KVM_ISA_VMX); + + if (unlikely(tdx_failed_vmentry(vcpu))) + return EXIT_FASTPATH_NONE; + + return tdx_exit_handlers_fastpath(vcpu); +} + +void tdx_inject_nmi(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.nmi_injections; + td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); + /* + * From KVM's perspective, NMI injection is completed right after + * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by + * the TDX module or not. + */ + vcpu->arch.nmi_injected = false; + /* + * TDX doesn't support KVM to request NMI window exit. If there is + * still a pending vNMI, KVM is not able to inject it along with the + * one pending in TDX module in a back-to-back way. Since the previous + * vNMI is still pending in TDX module, i.e. it has not been delivered + * to TDX guest yet, it's OK to collapse the pending vNMI into the + * previous one. The guest is expected to handle all the NMI sources + * when handling the first vNMI. + */ + vcpu->arch.nmi_pending = 0; +} + +static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) +{ + u32 intr_info = vmx_get_intr_info(vcpu); + + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on + * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). + */ + if (is_nmi(intr_info) || is_machine_check(intr_info)) + return 1; + + vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; + vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + vcpu->run->ex.error_code = 0; + + return 0; +} + +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); + return 1; +} + +static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) +{ + kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); + kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); + kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); + kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); + kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); + + return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); +} + +/* + * Split into chunks and check interrupt pending between chunks. This allows + * for timely injection of interrupts to prevent issues with guest lockup + * detection. 
+ */ +#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) +static void __tdx_map_gpa(struct vcpu_tdx *tdx); + +static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (vcpu->run->hypercall.ret) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; + if (tdx->map_gpa_next >= tdx->map_gpa_end) + return 1; + + /* + * Stop processing the remaining part if there is a pending interrupt, + * which could be qualified to deliver. Skip checking pending RVI for + * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). + */ + if (kvm_vcpu_has_events(vcpu)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + __tdx_map_gpa(tdx); + return 0; +} + +static void __tdx_map_gpa(struct vcpu_tdx *tdx) +{ + u64 gpa = tdx->map_gpa_next; + u64 size = tdx->map_gpa_end - tdx->map_gpa_next; + + if (size > TDX_MAP_GPA_MAX_LEN) + size = TDX_MAP_GPA_MAX_LEN; + + tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; + tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + tdx->vcpu.run->hypercall.ret = 0; + tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; + tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? + KVM_MAP_GPA_RANGE_ENCRYPTED : + KVM_MAP_GPA_RANGE_DECRYPTED; + tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; + + tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; +} + +static int tdx_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + u64 ret; + + /* + * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires + * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE + * bit set. This is a base call so it should always be supported, but + * KVM has no way to ensure that userspace implements the GHCI correctly. + * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error + * to the guest. 
+ */ + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + goto error; + } + + if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || + !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || + (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { + ret = TDVMCALL_STATUS_INVALID_OPERAND; + goto error; + } + + if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { + ret = TDVMCALL_STATUS_ALIGN_ERROR; + goto error; + } + + tdx->map_gpa_end = gpa + size; + tdx->map_gpa_next = gpa; + + __tdx_map_gpa(tdx); + return 0; + +error: + tdvmcall_set_return_code(vcpu, ret); + tdx->vp_enter_args.r11 = gpa; + return 1; +} + +static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 *regs = vcpu->run->system_event.data; + u64 *module_regs = &tdx->vp_enter_args.r8; + int index = VCPU_REGS_RAX; + + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; + vcpu->run->system_event.ndata = 16; + + /* Dump 16 general-purpose registers to userspace in ascending order. */ + regs[index++] = tdx->vp_enter_ret; + regs[index++] = tdx->vp_enter_args.rcx; + regs[index++] = tdx->vp_enter_args.rdx; + regs[index++] = tdx->vp_enter_args.rbx; + regs[index++] = 0; + regs[index++] = 0; + regs[index++] = tdx->vp_enter_args.rsi; + regs[index] = tdx->vp_enter_args.rdi; + for (index = 0; index < 8; index++) + regs[VCPU_REGS_R8 + index] = module_regs[index]; + + return 0; +} + +static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 eax, ebx, ecx, edx; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* EAX and ECX for cpuid is stored in R12 and R13. */ + eax = tdx->vp_enter_args.r12; + ecx = tdx->vp_enter_args.r13; + + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); + + tdx->vp_enter_args.r12 = eax; + tdx->vp_enter_args.r13 = ebx; + tdx->vp_enter_args.r14 = ecx; + tdx->vp_enter_args.r15 = edx; + + return 1; +} + +static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + +static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + int ret; + + ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + + WARN_ON_ONCE(!ret); + + tdvmcall_set_return_val(vcpu, val); + + return 1; +} + +static int tdx_emulate_io(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + unsigned int port; + u64 size, write; + int ret; + + ++vcpu->stat.io_exits; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + port = tdx->vp_enter_args.r14; + + if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (write) { + val = tdx->vp_enter_args.r15; + ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); + } else { + ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); + } + + if (!ret) + vcpu->arch.complete_userspace_io = write ? 
tdx_complete_pio_out : + tdx_complete_pio_in; + else if (!write) + tdvmcall_set_return_val(vcpu, val); + + return ret; +} + +static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) +{ + unsigned long val = 0; + gpa_t gpa; + int size; + + gpa = vcpu->mmio_fragments[0].gpa; + size = vcpu->mmio_fragments[0].len; + + memcpy(&val, vcpu->run->mmio.data, size); + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 1; +} + +static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, + unsigned long val) +{ + if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return 0; + } + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); + if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + return 0; +} + +static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) +{ + unsigned long val; + + if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 0; +} + +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + int size, write, r; + unsigned long val; + gpa_t gpa; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + gpa = tdx->vp_enter_args.r14; + val = write ? tdx->vp_enter_args.r15 : 0; + + if (size != 1 && size != 2 && size != 4 && size != 8) + goto error; + if (write != 0 && write != 1) + goto error; + + /* + * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to + * do MMIO emulation for private GPA. + */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) + goto error; + + gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + + if (write) + r = tdx_mmio_write(vcpu, gpa, size, val); + else + r = tdx_mmio_read(vcpu, gpa, size); + if (!r) + /* Kernel completed device emulation. */ + return 1; + + /* Request the device emulation to userspace device model. */ + vcpu->mmio_is_write = write; + if (!write) + vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = size; + vcpu->run->mmio.is_write = write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + if (write) { + memcpy(vcpu->run->mmio.data, &val, size); + } else { + vcpu->mmio_fragments[0].gpa = gpa; + vcpu->mmio_fragments[0].len = size; + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); + } + return 0; + +error: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; +} + +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. 
+ */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + +static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + switch (tdx->vp_enter_args.r12) { + case 0: + tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; + tdx->vp_enter_args.r13 = 0; + tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } +} + +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + +static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 vector = tdx->vp_enter_args.r12; + + if (vector < 32 || vector > 255) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT; + vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.setup_event_notify.vector = vector; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + +static int handle_tdvmcall(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case TDVMCALL_MAP_GPA: + return tdx_map_gpa(vcpu); + case TDVMCALL_REPORT_FATAL_ERROR: + return tdx_report_fatal_error(vcpu); + case TDVMCALL_GET_TD_VM_CALL_INFO: + return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); + case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: + return tdx_setup_event_notify_interrupt(vcpu); + default: + break; + } + + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); + return 1; +} + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) +{ + u64 shared_bit = (pgd_level == 5) ? 
TDX_SHARED_BIT_PWL_5 : + TDX_SHARED_BIT_PWL_4; + + if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) + return; + + td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); +} + +static void tdx_unpin(struct kvm *kvm, struct page *page) +{ + put_page(page); +} + +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 entry, level_state; + u64 err; + + err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) { + tdx_unpin(kvm, page); + return -EBUSY; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + tdx_unpin(kvm, page); + return -EIO; + } + + return 0; +} + +/* + * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the + * callback tdx_gmem_post_populate() then maps pages into private memory + * through the SEAMCALL TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the + * private EPT structures for the page to have been built beforehand, which is + * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that + * were added to the EPT structures but not yet added with TDH.MEM.PAGE.ADD(). + * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there + * are no half-initialized shared EPT pages. + */ +static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) + return -EINVAL; + + /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ + atomic64_inc(&kvm_tdx->nr_premapped); + return 0; +} + +static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * Because guest_memfd doesn't support page migration with + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page + * migration. Until guest_memfd supports page migration, prevent page + * migration here. + * TODO: Once guest_memfd introduces a callback on page migration, + * implement it and remove get_page/put_page(). + */ + get_page(page); + + /* + * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching + * barrier in tdx_td_finalize(). + */ + smp_rmb(); + if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) + return tdx_mem_page_aug(kvm, gfn, level, page); + + return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); +} + +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * When zapping a private page, the write lock is held, so there is no + * race with other vCPU SEPT operations. Races with TDH.VP.ENTER (due to + * the 0-step mitigation) and guest TDCALLs are still possible. 
+ */ + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* + * The second retry is expected to succeed after kicking off all + * other vCPUs and prevent them from invoking TDH.VP.ENTER. + */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(page); + tdx_unpin(kvm, page); + return 0; +} + +static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + struct page *page = virt_to_page(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, + &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +/* + * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is + * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called + * successfully. + * + * Since tdh_mem_sept_add() must have been invoked successfully before a + * non-leaf entry present in the mirrored page table, the SEPT ZAP related + * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead + * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the + * SEPT. + * + * Further check if the returned entry from SEPT walking is with RWX permissions + * to filter out anything unexpected. + * + * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from + * level_state returned from a SEAMCALL error is the same as that passed into + * the SEAMCALL. + */ +static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, + u64 entry, int level) +{ + if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) + return false; + + if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) + return false; + + if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) + return false; + + return true; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. 
*/ + WARN_ON_ONCE(level != PG_LEVEL_4K); + + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && + !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { + atomic64_dec(&kvm_tdx->nr_premapped); + tdx_unpin(kvm, page); + return 0; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + return -EIO; + } + return 1; +} + +/* + * Ensure that the shared and private EPTs are flushed on all vCPUs. + * tdh_mem_track() is the only caller that increases the TD epoch. An increase in + * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are + * running in guest mode with the value "N - 1". + * + * A successful execution of tdh_mem_track() ensures that vCPUs can only run in + * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch + * has been increased to "N + 1". + * + * Kicking off all vCPUs after that further ensures that no vCPUs can run in guest + * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. + * to increase the TD epoch to "N + 2"). + * + * The TDX module will flush the EPT on the next TD enter and make vCPUs run in + * guest mode with TD epoch value "N + 1". + * + * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by + * waiting for the empty IPI handler ack_kick(). + * + * No action is required for the vCPUs being kicked off, since the kick-off is + * guaranteed to occur after the TD epoch increment and before the next + * tdh_mem_track(). + */ +static void tdx_track(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + + /* If the TD isn't finalized, no vCPU has run yet. */ + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return; + + lockdep_assert_held_write(&kvm->mmu_lock); + + err = tdh_mem_track(&kvm_tdx->td); + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_track(&kvm_tdx->td); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) + pr_tdx_error(TDH_MEM_TRACK, err); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* + * free_external_spt() is only called after the HKID has been freed, + * when the TD is being torn down. + * KVM doesn't (yet) zap page table pages in the mirror page table while + * the TD is active, though guest pages mapped in the mirror page table + * could be zapped while the TD is active, e.g. for shared <-> private + * conversion and slot move/deletion. + */ + if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * The HKID assigned to this TD was already freed and the cache was + * already flushed. There is no need to flush again. + */ + return tdx_reclaim_page(virt_to_page(private_spt)); +} + +static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + int ret; + + /* + * HKID is released after all private pages have been removed, and set + * before any might be populated. 
Warn if zapping is attempted when + * there can't be anything populated in the private EPT. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); + if (ret <= 0) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, page); +} + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* TDX supports only posted interrupt. No lapic emulation. */ + __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); +} + +static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) +{ + u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; + u64 eq = vmx_get_exit_qual(vcpu); + + if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) + return false; + + return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); +} + +static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual; + gpa_t gpa = to_tdx(vcpu)->exit_gpa; + bool local_retry = false; + int ret; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + if (tdx_is_sept_violation_unexpected_pending(vcpu)) { + pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", + gpa, vcpu->vcpu_id); + kvm_vm_dead(vcpu->kvm); + return -EIO; + } + /* + * Always treat SEPT violations as write faults. Ignore the + * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. + * TD private pages are always RWX in the SEPT tables, + * i.e. they're always mapped writable. Just as importantly, + * treating SEPT violations as write faults is necessary to + * avoid COW allocations, which will cause TDAUGPAGE failures + * due to aliasing a single HPA to multiple GPAs. + */ + exit_qual = EPT_VIOLATION_ACC_WRITE; + + /* Only private GPA triggers zero-step mitigation */ + local_retry = true; + } else { + exit_qual = vmx_get_exit_qual(vcpu); + /* + * EPT violation due to instruction fetch should never be + * triggered from shared memory in TDX guest. If such EPT + * violation occurs, treat it as broken hardware. + */ + if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) + return -EIO; + } + + trace_kvm_page_fault(vcpu, gpa, exit_qual); + + /* + * To minimize TDH.VP.ENTER invocations, retry locally for private GPA + * mapping in TDX. + * + * KVM may return RET_PF_RETRY for private GPA due to + * - contentions when atomically updating SPTEs of the mirror page table + * - in-progress GFN invalidation or memslot removal. + * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, + * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) + * or certain TDCALLs. + * + * If TDH.VP.ENTER is invoked more times than the threshold set by the + * TDX module before KVM resolves the private GPA mapping, the TDX + * module will activate zero-step mitigation during TDH.VP.ENTER. This + * process acquires an SEPT tree lock in the TDX module, leading to + * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD + * operations on other vCPUs. + * + * Breaking out of local retries for kvm_vcpu_has_events() is for + * interrupt injection. 
kvm_vcpu_has_events() should not see pending + * events for TDX. Since KVM can't determine if IRQs (or NMIs) are + * blocked by TDs, false positives are inevitable, i.e., KVM may re-enter + * the guest even if the IRQ/NMI can't be delivered. + * + * Note: even without breaking out of local retries, zero-step + * mitigation may still occur due to + * - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, + * - a single RIP causing EPT violations for more GFNs than the + * threshold count. + * This is safe, as triggering zero-step mitigation only introduces + * contention on page installation SEAMCALLs on other vCPUs, which will + * handle retries locally in their EPT violation handlers. + */ + while (1) { + ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); + + if (ret != RET_PF_RETRY || !local_retry) + break; + + if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) + break; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { + ret = -EIO; + break; + } + + cond_resched(); + } + return ret; +} + +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) + tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); + + return 1; +} + + +int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 vp_enter_ret = tdx->vp_enter_ret; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); + + if (fastpath != EXIT_FASTPATH_NONE) + return 1; + + if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { + KVM_BUG_ON(1, vcpu->kvm); + return -EIO; + } + + /* + * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and + * TDX_SEAMCALL_VMFAILINVALID. + */ + if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { + KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); + goto unhandled_exit; + } + + if (unlikely(tdx_failed_vmentry(vcpu))) { + /* + * If the guest state is protected, that means off-TD debug is + * not enabled, so TDX_NON_RECOVERABLE must be set. 
+ */ + WARN_ON_ONCE(vcpu->arch.guest_state_protected && + !(vp_enter_ret & TDX_NON_RECOVERABLE)); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; + return 0; + } + + if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && + exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { + kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); + goto unhandled_exit; + } + + WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && + (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); + + switch (exit_reason.basic) { + case EXIT_REASON_TRIPLE_FAULT: + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + return 0; + case EXIT_REASON_EXCEPTION_NMI: + return tdx_handle_exception_nmi(vcpu); + case EXIT_REASON_EXTERNAL_INTERRUPT: + ++vcpu->stat.irq_exits; + return 1; + case EXIT_REASON_CPUID: + return tdx_emulate_cpuid(vcpu); + case EXIT_REASON_HLT: + return kvm_emulate_halt_noskip(vcpu); + case EXIT_REASON_TDCALL: + return handle_tdvmcall(vcpu); + case EXIT_REASON_VMCALL: + return tdx_emulate_vmcall(vcpu); + case EXIT_REASON_IO_INSTRUCTION: + return tdx_emulate_io(vcpu); + case EXIT_REASON_MSR_READ: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + return kvm_emulate_rdmsr(vcpu); + case EXIT_REASON_MSR_WRITE: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); + kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); + return kvm_emulate_wrmsr(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + return tdx_emulate_mmio(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return tdx_handle_ept_violation(vcpu); + case EXIT_REASON_OTHER_SMI: + /* + * Unlike VMX, SMI in SEAM non-root mode (i.e. when + * TD guest vCPU is running) will cause VM exit to TDX module, + * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered + * and handled by kernel handler right away. + * + * The Other SMI exit can also be caused by the SEAM non-root + * machine check delivered via Machine Check System Management + * Interrupt (MSMI), but it has already been handled by the + * kernel machine check handler, i.e., the memory page has been + * marked as poisoned and it won't be freed to the free list + * when the TDX guest is terminated (the TDX module marks the + * guest as dead and prevent it from further running when + * machine check happens in SEAM non-root). + * + * - A MSMI will not reach here, it's handled as non_recoverable + * case above. + * - If it's not an MSMI, no need to do anything here. 
+ */ + return 1; + default: + break; + } + +unhandled_exit: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vp_enter_ret; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + return 0; +} + +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + *reason = tdx->vt.exit_reason.full; + if (*reason != -1u) { + *info1 = vmx_get_exit_qual(vcpu); + *info2 = tdx->ext_exit_qualification; + *intr_info = vmx_get_intr_info(vcpu); + } else { + *info1 = 0; + *info2 = 0; + *intr_info = 0; + } + + *error_code = 0; +} + +bool tdx_has_emulated_msr(u32 index) +{ + switch (index) { + case MSR_IA32_UCODE_REV: + case MSR_IA32_ARCH_CAPABILITIES: + case MSR_IA32_POWER_CTL: + case MSR_IA32_CR_PAT: + case MSR_MTRRcap: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_TSC_DEADLINE: + case MSR_IA32_MISC_ENABLE: + case MSR_PLATFORM_INFO: + case MSR_MISC_FEATURES_ENABLES: + case MSR_IA32_APICBASE: + case MSR_EFER: + case MSR_IA32_FEAT_CTL: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_EXT_CTL: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ + case MSR_KVM_POLL_CONTROL: + return true; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + /* + * x2APIC registers that are virtualized by the CPU can't be + * emulated, KVM doesn't have access to the virtual APIC page. + */ + switch (index) { + case X2APIC_MSR(APIC_TASKPRI): + case X2APIC_MSR(APIC_PROCPRI): + case X2APIC_MSR(APIC_EOI): + case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): + case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): + case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): + return false; + default: + return true; + } + default: + return false; + } +} + +static bool tdx_is_read_only_msr(u32 index) +{ + return index == MSR_IA32_APICBASE || index == MSR_EFER || + index == MSR_IA32_FEAT_CTL; +} + +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_FEAT_CTL: + /* + * MCE and MCA are advertised via cpuid. Guest kernel could + * check if LMCE is enabled or not. 
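 *
 * Concretely, a TD guest reading this MSR sees one of two values (bit 0 is
 * the lock bit, bit 20 is "LMCE enabled" in the standard IA32_FEAT_CTL
 * layout):
 *
 *   FEAT_CTL_LOCKED                          = 0x00000001   (LMCE hidden)
 *   FEAT_CTL_LOCKED | FEAT_CTL_LMCE_ENABLED  = 0x00100001   (LMCE exposed)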
+ */ + msr->data = FEAT_CTL_LOCKED; + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + msr->data |= FEAT_CTL_LMCE_ENABLED; + return 0; + case MSR_IA32_MCG_EXT_CTL: + if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) + return 1; + msr->data = vcpu->arch.mcg_ext_ctl; + return 0; + default: + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_get_msr_common(vcpu, msr); + } +} + +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_MCG_EXT_CTL: + if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || + (msr->data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = msr->data; + return 0; + default: + if (tdx_is_read_only_msr(msr->index)) + return 1; + + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_set_msr_common(vcpu, msr); + } +} + +static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_tdx_capabilities __user *user_caps; + struct kvm_tdx_capabilities *caps = NULL; + u32 nr_user_entries; + int ret = 0; + + /* flags is reserved for future use */ + if (cmd->flags) + return -EINVAL; + + caps = kzalloc(sizeof(*caps) + + sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, + GFP_KERNEL); + if (!caps) + return -ENOMEM; + + user_caps = u64_to_user_ptr(cmd->data); + if (get_user(nr_user_entries, &user_caps->cpuid.nent)) { + ret = -EFAULT; + goto out; + } + + if (nr_user_entries < td_conf->num_cpuid_config) { + ret = -E2BIG; + goto out; + } + + ret = init_kvm_tdx_caps(td_conf, caps); + if (ret) + goto out; + + if (copy_to_user(user_caps, caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, + caps->cpuid.nent * + sizeof(caps->cpuid.entries[0]))) + ret = -EFAULT; + +out: + /* kfree() accepts NULL. */ + kfree(caps); + return ret; +} + +/* + * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is + * similar to TDX's GPAW. Use this field as the interface for userspace to + * configure the GPAW and EPT level for TDs. + * + * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level + * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always + * supported. Value 52 is only supported when the platform supports 5 level + * EPT. 
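/*
 * A minimal sketch of the EAX[23:16] encoding described above; the kernel's
 * real helpers are tdx_get_guest_phys_addr_bits() and
 * tdx_set_guest_phys_addr_bits(), the names and constants below are
 * illustrative only.
 */
#include <stdint.h>

#define EXAMPLE_GPA_BITS_MASK	0x00ff0000u
#define EXAMPLE_GPA_BITS_SHIFT	16

static inline unsigned int example_get_guest_pa_bits(uint32_t eax)
{
	return (eax & EXAMPLE_GPA_BITS_MASK) >> EXAMPLE_GPA_BITS_SHIFT;
}

static inline uint32_t example_set_guest_pa_bits(uint32_t eax, unsigned int bits)
{
	return (eax & ~EXAMPLE_GPA_BITS_MASK) |
	       ((uint32_t)bits << EXAMPLE_GPA_BITS_SHIFT);
}

/* example_get_guest_pa_bits(0x00340000) == 52: GPAW-52, 5-level EPT */
/* example_get_guest_pa_bits(0x00300000) == 48: GPAW-48, 4-level EPT */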
+ */ +static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct kvm_cpuid_entry2 *entry; + int guest_pa; + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); + if (!entry) + return -EINVAL; + + guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); + + if (guest_pa != 48 && guest_pa != 52) + return -EINVAL; + + if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) + return -EINVAL; + + td_params->eptp_controls = VMX_EPTP_MT_WB; + if (guest_pa == 52) { + td_params->eptp_controls |= VMX_EPTP_PWL_5; + td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; + } else { + td_params->eptp_controls |= VMX_EPTP_PWL_4; + } + + return 0; +} + +static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + const struct kvm_cpuid_entry2 *entry; + struct tdx_cpuid_value *value; + int i, copy_cnt = 0; + + /* + * td_params.cpuid_values: The number and the order of cpuid_value must + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} + * It's assumed that td_params was zeroed. + */ + for (i = 0; i < td_conf->num_cpuid_config; i++) { + struct kvm_cpuid_entry2 tmp; + + td_init_cpuid_entry2(&tmp, i); + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, + tmp.function, tmp.index); + if (!entry) + continue; + + if (tdx_unsupported_cpuid(entry)) + return -EINVAL; + + copy_cnt++; + + value = &td_params->cpuid_values[i]; + value->eax = entry->eax; + value->ebx = entry->ebx; + value->ecx = entry->ecx; + value->edx = entry->edx; + + /* + * TDX module does not accept nonzero bits 16..23 for the + * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). + */ + if (tmp.function == 0x80000008) + value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); + } + + /* + * Rely on the TDX module to reject invalid configuration, but it can't + * check of leafs that don't have a proper slot in td_params->cpuid_values + * to stick then. So fail if there were entries that didn't get copied to + * td_params. 
+ */ + if (copy_cnt != cpuid->nent) + return -EINVAL; + + return 0; +} + +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, + struct kvm_tdx_init_vm *init_vm) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; + int ret; + + if (kvm->created_vcpus) + return -EBUSY; + + if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) + return -EINVAL; + + if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) + return -EINVAL; + + td_params->max_vcpus = kvm->max_vcpus; + td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; + td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; + + td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; + td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); + + ret = setup_tdparams_eptp_controls(cpuid, td_params); + if (ret) + return ret; + + ret = setup_tdparams_cpuids(cpuid, td_params); + if (ret) + return ret; + +#define MEMCPY_SAME_SIZE(dst, src) \ + do { \ + BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ + memcpy((dst), (src), sizeof(dst)); \ + } while (0) + + MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); + MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); + MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); + + return 0; +} + +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, + u64 *seamcall_err) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages; + struct page **tdcs_pages = NULL; + struct page *tdr_page; + int ret, i; + u64 err, rcx; + + *seamcall_err = 0; + ret = tdx_guest_keyid_alloc(); + if (ret < 0) + return ret; + kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; + + ret = -ENOMEM; + + atomic_inc(&nr_configured_hkid); + + tdr_page = alloc_page(GFP_KERNEL); + if (!tdr_page) + goto free_hkid; + + kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; + /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ + kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; + tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), + GFP_KERNEL | __GFP_ZERO); + if (!tdcs_pages) + goto free_tdr; + + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + tdcs_pages[i] = alloc_page(GFP_KERNEL); + if (!tdcs_pages[i]) + goto free_tdcs; + } + + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) + goto free_tdcs; + + cpus_read_lock(); + + /* + * Need at least one CPU of the package to be online in order to + * program all packages for host key id. Check it. + */ + for_each_present_cpu(i) + cpumask_set_cpu(topology_physical_package_id(i), packages); + for_each_online_cpu(i) + cpumask_clear_cpu(topology_physical_package_id(i), packages); + if (!cpumask_empty(packages)) { + ret = -EIO; + /* + * Because it's hard for human operator to figure out the + * reason, warn it. + */ +#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n" + pr_warn_ratelimited(MSG_ALLPKG); + goto free_packages; + } + + /* + * TDH.MNG.CREATE tries to grab the global TDX module and fails + * with TDX_OPERAND_BUSY when it fails to grab. Take the global + * lock to prevent it from failure. 
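/*
 * The SEAMCALL error handling in this file follows one recurring pattern,
 * sketched here as an illustrative helper that the real code open codes per
 * call site: TDX_RND_NO_ENTROPY is transient and becomes -EAGAIN so the
 * caller can retry, a busy operand becomes -EBUSY, and anything else is
 * unexpected and reported as -EIO.
 */
static int example_seamcall_to_errno(u64 err)
{
	if (!err)
		return 0;
	if (err == TDX_RND_NO_ENTROPY)
		return -EAGAIN;
	if (tdx_operand_busy(err))
		return -EBUSY;
	return -EIO;
}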
+ */ + mutex_lock(&tdx_lock); + kvm_tdx->td.tdr_page = tdr_page; + err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); + mutex_unlock(&tdx_lock); + + if (err == TDX_RND_NO_ENTROPY) { + ret = -EAGAIN; + goto free_packages; + } + + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_CREATE, err); + ret = -EIO; + goto free_packages; + } + + for_each_online_cpu(i) { + int pkg = topology_physical_package_id(i); + + if (cpumask_test_and_set_cpu(pkg, packages)) + continue; + + /* + * Program the memory controller in the package with an + * encryption key associated to a TDX private host key id + * assigned to this TDR. Concurrent operations on same memory + * controller results in TDX_OPERAND_BUSY. No locking needed + * beyond the cpus_read_lock() above as it serializes against + * hotplug and the first online CPU of the package is always + * used. We never have two CPUs in the same socket trying to + * program the key. + */ + ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, + kvm_tdx, true); + if (ret) + break; + } + cpus_read_unlock(); + free_cpumask_var(packages); + if (ret) { + i = 0; + goto teardown; + } + + kvm_tdx->td.tdcs_pages = tdcs_pages; + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); + if (err == TDX_RND_NO_ENTROPY) { + /* Here it's hard to allow userspace to retry. */ + ret = -EAGAIN; + goto teardown; + } + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_ADDCX, err); + ret = -EIO; + goto teardown; + } + } + + err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); + if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { + /* + * Because a user gives operands, don't warn. + * Return a hint to the user because it's sometimes hard for the + * user to figure out which operand is invalid. SEAMCALL status + * code includes which operand caused invalid operand error. + */ + *seamcall_err = err; + ret = -EINVAL; + goto teardown; + } else if (WARN_ON_ONCE(err)) { + pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + ret = -EIO; + goto teardown; + } + + return 0; + + /* + * The sequence for freeing resources from a partially initialized TD + * varies based on where in the initialization flow failure occurred. + * Simply use the full teardown and destroy, which naturally play nice + * with partial initialization. 
+ */ +teardown: + /* Only free pages not yet added, so start at 'i' */ + for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) { + __free_page(tdcs_pages[i]); + tdcs_pages[i] = NULL; + } + } + if (!kvm_tdx->td.tdcs_pages) + kfree(tdcs_pages); + + tdx_mmu_release_hkid(kvm); + tdx_reclaim_td_control_pages(kvm); + + return ret; + +free_packages: + cpus_read_unlock(); + free_cpumask_var(packages); + +free_tdcs: + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) + __free_page(tdcs_pages[i]); + } + kfree(tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + +free_tdr: + if (tdr_page) + __free_page(tdr_page); + kvm_tdx->td.tdr_page = 0; + +free_hkid: + tdx_hkid_free(kvm_tdx); + + return ret; +} + +static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, + u64 *data) +{ + u64 err; + + err = tdh_mng_rd(&tdx->td, field_id, data); + + return err; +} + +#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) +#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) + +static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, + bool sub_leaf_set, int *entry_index, + struct kvm_cpuid_entry2 *out) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; + u64 ebx_eax, edx_ecx; + u64 err = 0; + + if (sub_leaf > 0b1111111) + return -EINVAL; + + if (*entry_index >= KVM_MAX_CPUID_ENTRIES) + return -EINVAL; + + if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || + sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) + return -EINVAL; + + /* + * bit 23:17, REVSERVED: reserved, must be 0; + * bit 16, LEAF_31: leaf number bit 31; + * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are + * implicitly 0; + * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; + * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, + * the SUBLEAF_6_0 is all-1. + * sub-leaf bits 31:7 are implicitly 0; + * bit 0, ELEMENT_I: Element index within field; + */ + field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; + field_id |= (leaf & 0x7f) << 9; + if (sub_leaf_set) + field_id |= (sub_leaf & 0x7f) << 1; + else + field_id |= 0x1fe; + + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); + if (err) //TODO check for specific errors + goto err_out; + + out->eax = (u32) ebx_eax; + out->ebx = (u32) (ebx_eax >> 32); + + field_id++; + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); + /* + * It's weird that reading edx_ecx fails while reading ebx_eax + * succeeded. + */ + if (WARN_ON_ONCE(err)) + goto err_out; + + out->ecx = (u32) edx_ecx; + out->edx = (u32) (edx_ecx >> 32); + + out->function = leaf; + out->index = sub_leaf; + out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; + + /* + * Work around missing support on old TDX modules, fetch + * guest maxpa from gfn_direct_bits. 
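 *
 * Worked example: gfn_to_gpa(kvm_gfn_direct_bits(kvm)) is the GPA.SHARED
 * bit chosen at TD init, GPA[51] for GPAW-52 or GPA[47] for GPAW-48, so
 *
 *   __ffs(1ULL << 51) + 1 = 52   and   __ffs(1ULL << 47) + 1 = 48
 *
 * which reconstructs the guest MAXPA without relying on the module-reported
 * CPUID value.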
+ */ + if (leaf == 0x80000008) { + gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + unsigned int g_maxpa = __ffs(gpa_bits) + 1; + + out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); + } + + (*entry_index)++; + + return 0; + +err_out: + out->eax = 0; + out->ebx = 0; + out->ecx = 0; + out->edx = 0; + + return -EIO; +} + +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_vm *init_vm; + struct td_params *td_params = NULL; + int ret; + + BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); + BUILD_BUG_ON(sizeof(struct td_params) != 1024); + + if (kvm_tdx->state != TD_STATE_UNINITIALIZED) + return -EINVAL; + + if (cmd->flags) + return -EINVAL; + + init_vm = kmalloc(sizeof(*init_vm) + + sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!init_vm) + return -ENOMEM; + + if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { + ret = -EFAULT; + goto out; + } + + if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { + ret = -E2BIG; + goto out; + } + + if (copy_from_user(init_vm->cpuid.entries, + u64_to_user_ptr(cmd->data) + sizeof(*init_vm), + flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { + ret = -EFAULT; + goto out; + } + + if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { + ret = -EINVAL; + goto out; + } + + if (init_vm->cpuid.padding) { + ret = -EINVAL; + goto out; + } + + td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); + if (!td_params) { + ret = -ENOMEM; + goto out; + } + + ret = setup_tdparams(kvm, td_params, init_vm); + if (ret) + goto out; + + ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); + if (ret) + goto out; + + kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); + kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); + kvm_tdx->attributes = td_params->attributes; + kvm_tdx->xfam = td_params->xfam; + + if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; + else + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; + + kvm_tdx->state = TD_STATE_INITIALIZED; +out: + /* kfree() accepts NULL. */ + kfree(init_vm); + kfree(td_params); + + return ret; +} + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + /* + * flush_tlb_current() is invoked when the first time for the vcpu to + * run or when root of shared EPT is invalidated. + * KVM only needs to flush shared EPT because the TDX module handles TLB + * invalidation for private EPT in tdh_vp_enter(); + * + * A single context invalidation for shared EPT can be performed here. + * However, this single context invalidation requires the private EPTP + * rather than the shared EPTP to flush shared EPT, as shared EPT uses + * private EPTP as its ASID for TLB invalidation. + * + * To avoid reading back private EPTP, perform a global invalidation for + * shared EPT instead to keep this function simple. + */ + ept_sync_global(); +} + +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * TDX has called tdx_track() in tdx_sept_remove_private_spte() to + * ensure that private EPT will be flushed on the next TD enter. No need + * to call tdx_track() here again even when this callback is a result of + * zapping private EPT. 
+ * + * Due to the lack of the context to determine which EPT has been + * affected by zapping, invoke invept() directly here for both shared + * EPT and private EPT for simplicity, though it's not necessary for + * private EPT. + */ + ept_sync_global(); +} + +static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + guard(mutex)(&kvm->slots_lock); + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + /* + * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue + * TDH.MEM.PAGE.ADD(). + */ + if (atomic64_read(&kvm_tdx->nr_premapped)) + return -EINVAL; + + cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); + if (tdx_operand_busy(cmd->hw_error)) + return -EBUSY; + if (KVM_BUG_ON(cmd->hw_error, kvm)) { + pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + return -EIO; + } + + kvm_tdx->state = TD_STATE_RUNNABLE; + /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ + smp_wmb(); + kvm->arch.pre_fault_allowed = true; + return 0; +} + +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +{ + struct kvm_tdx_cmd tdx_cmd; + int r; + + if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) + return -EFAULT; + + /* + * Userspace should never set hw_error. It is used to fill + * hardware-defined error by the kernel. + */ + if (tdx_cmd.hw_error) + return -EINVAL; + + mutex_lock(&kvm->lock); + + switch (tdx_cmd.id) { + case KVM_TDX_CAPABILITIES: + r = tdx_get_capabilities(&tdx_cmd); + break; + case KVM_TDX_INIT_VM: + r = tdx_td_init(kvm, &tdx_cmd); + break; + case KVM_TDX_FINALIZE_VM: + r = tdx_td_finalize(kvm, &tdx_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct page *page; + int ret, i; + u64 err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + tdx->vp.tdvpr_page = page; + + tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), + GFP_KERNEL); + if (!tdx->vp.tdcx_pages) { + ret = -ENOMEM; + goto free_tdvpr; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_tdcx; + } + tdx->vp.tdcx_pages[i] = page; + } + + err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); + if (KVM_BUG_ON(err, vcpu->kvm)) { + ret = -EIO; + pr_tdx_error(TDH_VP_CREATE, err); + goto free_tdcx; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_ADDCX, err); + /* + * Pages already added are reclaimed by the vcpu_free + * method, but the rest are freed here. 
+ */ + for (; i < kvm_tdx->td.tdcx_nr_pages; i++) { + __free_page(tdx->vp.tdcx_pages[i]); + tdx->vp.tdcx_pages[i] = NULL; + } + return -EIO; + } + } + + err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_INIT, err); + return -EIO; + } + + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; + +free_tdcx: + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + __free_page(tdx->vp.tdcx_pages[i]); + tdx->vp.tdcx_pages[i] = NULL; + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + +free_tdvpr: + if (tdx->vp.tdvpr_page) + __free_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + + return ret; +} + +/* Sometimes reads multipple subleafs. Return how many enties were written. */ +static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index, + struct kvm_cpuid_entry2 *output_e) +{ + int sub_leaf = 0; + int ret; + + /* First try without a subleaf */ + ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e); + + /* If success, or invalid leaf, just give up */ + if (ret != -EIO) + return ret; + + /* + * If the try without a subleaf failed, try reading subleafs until + * failure. The TDX module only supports 6 bits of subleaf index. + */ + while (1) { + /* Keep reading subleafs until there is a failure. */ + if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) + return !sub_leaf; + + sub_leaf++; + output_e++; + } + + return 0; +} + +static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct kvm_cpuid2 __user *output, *td_cpuid; + int r = 0, i = 0, leaf; + u32 level; + + output = u64_to_user_ptr(cmd->data); + td_cpuid = kzalloc(sizeof(*td_cpuid) + + sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!td_cpuid) + return -ENOMEM; + + if (copy_from_user(td_cpuid, output, sizeof(*output))) { + r = -EFAULT; + goto out; + } + + /* Read max CPUID for normal range */ + if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { + r = -EIO; + goto out; + } + level = td_cpuid->entries[0].eax; + + for (leaf = 1; leaf <= level; leaf++) + tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); + + /* Read max CPUID for extended range */ + if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { + r = -EIO; + goto out; + } + level = td_cpuid->entries[i - 1].eax; + + for (leaf = 0x80000001; leaf <= level; leaf++) + tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); + + if (td_cpuid->nent < i) + r = -E2BIG; + td_cpuid->nent = i; + + if (copy_to_user(output, td_cpuid, sizeof(*output))) { + r = -EFAULT; + goto out; + } + + if (r == -E2BIG) + goto out; + + if (copy_to_user(output->entries, td_cpuid->entries, + td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + r = -EFAULT; + +out: + kfree(td_cpuid); + + return r; +} + +static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + u64 apic_base; + struct vcpu_tdx *tdx = to_tdx(vcpu); + int ret; + + if (cmd->flags) + return -EINVAL; + + if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) + return -EINVAL; + + /* + * TDX requires X2APIC, userspace is responsible for configuring guest + * CPUID accordingly. + */ + apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | + (kvm_vcpu_is_reset_bsp(vcpu) ? 
MSR_IA32_APICBASE_BSP : 0); + if (kvm_apic_set_base(vcpu, apic_base, true)) + return -EINVAL; + + ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); + if (ret) + return ret; + + td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); + td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); + td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); + + tdx->state = VCPU_TD_STATE_INITIALIZED; + + return 0; +} + +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + /* + * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all + * INIT events. + * + * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as + * userspace needs to define the vCPU model before KVM can initialize + * vCPU state, e.g. to enable x2APIC. + */ + WARN_ON_ONCE(init_event); +} + +struct tdx_gmem_post_populate_arg { + struct kvm_vcpu *vcpu; + __u32 flags; +}; + +static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *_arg) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct tdx_gmem_post_populate_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + gpa_t gpa = gfn_to_gpa(gfn); + u8 level = PG_LEVEL_4K; + struct page *src_page; + int ret, i; + u64 err, entry, level_state; + + /* + * Get the source page if it has been faulted in. Return failure if the + * source page has been swapped out or unmapped in primary memory. + */ + ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); + if (ret < 0) + return ret; + if (ret != 1) + return -ENOMEM; + + ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); + if (ret < 0) + goto out; + + /* + * The private mem cannot be zapped after kvm_tdp_map_page() + * because all paths are covered by slots_lock and the + * filemap invalidate lock. Check that they are indeed enough. + */ + if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + scoped_guard(read_lock, &kvm->mmu_lock) { + if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { + ret = -EIO; + goto out; + } + } + } + + ret = 0; + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + src_page, &entry, &level_state); + if (err) { + ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; + goto out; + } + + if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + atomic64_dec(&kvm_tdx->nr_premapped); + + if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, + &level_state); + if (err) { + ret = -EIO; + break; + } + } + } + +out: + put_page(src_page); + return ret; +} + +static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_mem_region region; + struct tdx_gmem_post_populate_arg arg; + long gmem_ret; + int ret; + + if (tdx->state != VCPU_TD_STATE_INITIALIZED) + return -EINVAL; + + guard(mutex)(&kvm->slots_lock); + + /* Once TD is finalized, the initial guest memory is fixed. 
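/*
 * Userspace-side sketch of driving this flow.  The region layout matches the
 * fields consumed below; issuing the command via KVM_MEMORY_ENCRYPT_OP on
 * the vCPU fd is an assumption of this example, as is the helper name.
 * Error handling and retrying a partially-consumed region are omitted.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int example_tdx_init_mem_region(int vcpu_fd, uint64_t src_uva,
					uint64_t gpa, uint64_t nr_pages,
					int measure)
{
	struct kvm_tdx_init_mem_region region = {
		.source_addr = src_uva,
		.gpa = gpa,
		.nr_pages = nr_pages,
	};
	struct kvm_tdx_cmd cmd = {
		.id = KVM_TDX_INIT_MEM_REGION,
		.flags = measure ? KVM_TDX_MEASURE_MEMORY_REGION : 0,
		.data = (uint64_t)(uintptr_t)&region,
	};

	return ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}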
*/ + if (kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) + return -EINVAL; + + if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) + return -EFAULT; + + if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || + !region.nr_pages || + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || + !vt_is_tdx_private_gpa(kvm, region.gpa) || + !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) + return -EINVAL; + + kvm_mmu_reload(vcpu); + ret = 0; + while (region.nr_pages) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + arg = (struct tdx_gmem_post_populate_arg) { + .vcpu = vcpu, + .flags = cmd->flags, + }; + gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), + u64_to_user_ptr(region.source_addr), + 1, tdx_gmem_post_populate, &arg); + if (gmem_ret < 0) { + ret = gmem_ret; + break; + } + + if (gmem_ret != 1) { + ret = -EIO; + break; + } + + region.source_addr += PAGE_SIZE; + region.gpa += PAGE_SIZE; + region.nr_pages--; + + cond_resched(); + } + + if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) + ret = -EFAULT; + return ret; +} + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm_tdx_cmd cmd; + int ret; + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (copy_from_user(&cmd, argp, sizeof(cmd))) + return -EFAULT; + + if (cmd.hw_error) + return -EINVAL; + + switch (cmd.id) { + case KVM_TDX_INIT_VCPU: + ret = tdx_vcpu_init(vcpu, &cmd); + break; + case KVM_TDX_INIT_MEM_REGION: + ret = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; + case KVM_TDX_GET_CPUID: + ret = tdx_vcpu_get_cpuid(vcpu, &cmd); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return PG_LEVEL_4K; +} + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static int tdx_offline_cpu(unsigned int cpu) +{ + int i; + + /* No TD is running. Allow any cpu to be offline. */ + if (!atomic_read(&nr_configured_hkid)) + return 0; + + /* + * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to + * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory + * controller with pconfig. If we have active TDX HKID, refuse to + * offline the last online cpu. + */ + for_each_online_cpu(i) { + /* + * Found another online cpu on the same package. + * Allow to offline. + */ + if (i != cpu && topology_physical_package_id(i) == + topology_physical_package_id(cpu)) + return 0; + } + + /* + * This is the last cpu of this package. Don't offline it. + * + * Because it's hard for human operator to understand the + * reason, warn it. + */ +#define MSG_ALLPKG_ONLINE \ + "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" + pr_warn_ratelimited(MSG_ALLPKG_ONLINE); + return -EBUSY; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once TDX module is initialized, it cannot be disabled and + * re-initialized again w/o runtime update (which isn't + * supported by kernel). Only need to remove the cpuhp here. 
+ * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. + */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, tdx_offline_cpu); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + const struct tdx_sys_info_td_conf *td_conf; + int r, i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { + /* + * Check if MSRs (tdx_uret_msrs) can be saved/restored + * before returning to user space. + * + * this_cpu_ptr(user_return_msrs)->registered isn't checked + * because the registration is done at vcpu runtime by + * tdx_user_return_msr_update_cache(). + */ + tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); + if (tdx_uret_msrs[i].slot == -1) { + /* If any MSR isn't supported, it is a KVM bug */ + pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", + tdx_uret_msrs[i].msr); + return -EIO; + } + } + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* Get TDX global information for later use */ + tdx_sysinfo = tdx_get_sysinfo(); + if (WARN_ON_ONCE(!tdx_sysinfo)) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* Check TDX module and KVM capabilities */ + if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || + !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) + goto get_sysinfo_err; + + if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) + goto get_sysinfo_err; + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to + * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU + * extension on per-VM basis. + * + * TDX module reports such limit via the MAX_VCPU_PER_TD global + * metadata. Different modules may report different values. + * Some old module may also not support this metadata (in which + * case this limit is U16_MAX). + * + * In practice, the reported value reflects the maximum logical + * CPUs that ALL the platforms that the module supports can + * possibly have. + * + * Simply forwarding the MAX_VCPU_PER_TD to userspace could + * result in an unpredictable ABI. KVM instead always advertise + * the number of logical CPUs the platform has as the maximum + * vCPUs for TDX guests. + * + * Make sure MAX_VCPU_PER_TD reported by TDX module is not + * smaller than the number of logical CPUs, otherwise KVM will + * report an unsupported value to userspace. + * + * Note, a platform with TDX enabled in the BIOS cannot support + * physical CPU hotplug, and TDX requires the BIOS has marked + * all logical CPUs in MADT table as enabled. Just use + * num_present_cpus() for the number of logical CPUs. 
+ */ + td_conf = &tdx_sysinfo->td_conf; + if (td_conf->max_vcpus_per_td < num_present_cpus()) { + pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", + td_conf->max_vcpus_per_td, num_present_cpus()); + r = -EINVAL; + goto get_sysinfo_err; + } + + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; + +get_sysinfo_err: + __tdx_cleanup(); +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r, i; + + /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); + + if (!enable_tdx) + return 0; + + if (!enable_ept) { + pr_err("EPT is required for TDX\n"); + goto success_disable_tdx; + } + + if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { + pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); + goto success_disable_tdx; + } + + if (!enable_apicv) { + pr_err("APICv is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + pr_err("tdx: OSXSAVE is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { + pr_err("tdx: MOVDIR64B is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { + pr_err("Self-snoop is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if the TDX + * module could not be loaded. No need to print message saying + * "module is not loaded" because it was printed when the first + * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or + * vm_size, as kvm_x86_ops have already been finalized (and are + * intentionally not exported). The S-EPT code is unreachable, + * and allocating a few more bytes per VM in a should-be-rare + * failure scenario is a non-issue. 
+ */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} + +void __init tdx_hardware_setup(void) +{ + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx); + + /* + * Note, if the TDX module can't be loaded, KVM TDX support will be + * disabled but KVM will continue loading (see tdx_bringup()). + */ + vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); + + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..ca39a9391db1 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#include "tdx_arch.h" +#include "tdx_errno.h" + +#ifdef CONFIG_KVM_INTEL_TDX +#include "common.h" + +void tdx_hardware_setup(void); +int tdx_bringup(void); +void tdx_cleanup(void); + +extern bool enable_tdx; + +/* TDX module hardware states. These follow the TDX module OP_STATEs. */ +enum kvm_tdx_state { + TD_STATE_UNINITIALIZED = 0, + TD_STATE_INITIALIZED, + TD_STATE_RUNNABLE, +}; + +struct kvm_tdx { + struct kvm kvm; + + struct misc_cg *misc_cg; + int hkid; + enum kvm_tdx_state state; + + u64 attributes; + u64 xfam; + + u64 tsc_offset; + u64 tsc_multiplier; + + struct tdx_td td; + + /* For KVM_TDX_INIT_MEM_REGION. */ + atomic64_t nr_premapped; + + /* + * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do + * not contend with tdh_vp_enter() and TDCALLs. + * Set/unset is protected with kvm->mmu_lock. + */ + bool wait_for_sept_zap; +}; + +/* TDX module vCPU states */ +enum vcpu_tdx_state { + VCPU_TD_STATE_UNINITIALIZED = 0, + VCPU_TD_STATE_INITIALIZED, +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; + struct vcpu_vt vt; + u64 ext_exit_qualification; + gpa_t exit_gpa; + struct tdx_module_args vp_enter_args; + + struct tdx_vp vp; + + struct list_head cpu_list; + + u64 vp_enter_ret; + + enum vcpu_tdx_state state; + bool guest_entered; + + u64 map_gpa_next; + u64 map_gpa_end; +}; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err); + +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field) +{ + u64 err, data; + + err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data); + if (unlikely(err)) { + pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err); + return 0; + } + return data; +} + +static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) +{ +#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL +#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL +#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL +#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK) + + /* TDX is 64bit only. HIGH field isn't supported. 
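/*
 * Worked example of the encoding rules being checked: a VMCS field encoding
 * carries its access type in bit 0 and its width in bits 14:13.  The field
 * numbers below are the standard VMX encodings; the decoder itself is just
 * an illustration of those two bitfields.
 */
#include <stdint.h>
#include <stdio.h>

static const char *example_vmcs_width(uint32_t field)
{
	switch ((field >> 13) & 0x3) {
	case 0:  return "16-bit";
	case 1:  return "64-bit";
	case 2:  return "32-bit";
	default: return "natural width (64-bit, TDX is 64-bit only)";
	}
}

int main(void)
{
	/* GUEST_ES_SELECTOR, VM_EXIT_REASON, GUEST_RIP */
	const uint32_t fields[] = { 0x0800, 0x4402, 0x681e };

	for (unsigned int i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
		printf("0x%04x: %s, %s access\n", fields[i],
		       example_vmcs_width(fields[i]),
		       (fields[i] & 0x1) ? "high-half" : "full");

	return 0;
}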
*/ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && + VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH, + "Read/Write to TD VMCS *_HIGH fields not supported"); + + BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64); + +#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13) +#define VMCS_ENC_WIDTH_16BIT (0UL << 13) +#define VMCS_ENC_WIDTH_64BIT (1UL << 13) +#define VMCS_ENC_WIDTH_32BIT (2UL << 13) +#define VMCS_ENC_WIDTH_NATURAL (3UL << 13) +#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK) + + /* TDX is 64bit only. i.e. natural width = 64bit. */ + BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) && + (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT || + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL), + "Invalid TD VMCS access for 64-bit field"); + BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT, + "Invalid TD VMCS access for 32-bit field"); + BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT, + "Invalid TD VMCS access for 16-bit field"); +} + +static __always_inline void tdvps_management_check(u64 field, u8 bits) {} +static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {} + +#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \ +static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \ + u32 field) \ +{ \ + u64 err, data; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \ + if (unlikely(err)) { \ + tdh_vp_rd_failed(tdx, #uclass, field, err); \ + return 0; \ + } \ + return (u##bits)data; \ +} \ +static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \ + u32 field, u##bits val) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \ + GENMASK_ULL(bits - 1, 0)); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \ +} \ +static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \ +} \ +static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ +} + + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu); +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err); + +TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); + +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); +TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); + +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} + +#define enable_tdx 0 + +struct kvm_tdx { + struct kvm kvm; +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; +}; + +static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; } +static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; } + +#endif + +#endif diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h new file mode 100644 index 
000000000000..a30e880849e3 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural constants/data definitions for TDX SEAMCALLs */ + +#ifndef __KVM_X86_TDX_ARCH_H +#define __KVM_X86_TDX_ARCH_H + +#include <linux/types.h> + +/* TDX control structure (TDR/TDCS/TDVPS) field access codes */ +#define TDX_NON_ARCH BIT_ULL(63) +#define TDX_CLASS_SHIFT 56 +#define TDX_FIELD_MASK GENMASK_ULL(31, 0) + +#define __BUILD_TDX_FIELD(non_arch, class, field) \ + (((non_arch) ? TDX_NON_ARCH : 0) | \ + ((u64)(class) << TDX_CLASS_SHIFT) | \ + ((u64)(field) & TDX_FIELD_MASK)) + +#define BUILD_TDX_FIELD(class, field) \ + __BUILD_TDX_FIELD(false, (class), (field)) + +#define BUILD_TDX_FIELD_NON_ARCH(class, field) \ + __BUILD_TDX_FIELD(true, (class), (field)) + + +/* Class code for TD */ +#define TD_CLASS_EXECUTION_CONTROLS 17ULL + +/* Class code for TDVPS */ +#define TDVPS_CLASS_VMCS 0ULL +#define TDVPS_CLASS_GUEST_GPR 16ULL +#define TDVPS_CLASS_OTHER_GUEST 17ULL +#define TDVPS_CLASS_MANAGEMENT 32ULL + +enum tdx_tdcs_execution_control { + TD_TDCS_EXEC_TSC_OFFSET = 10, + TD_TDCS_EXEC_TSC_MULTIPLIER = 11, +}; + +enum tdx_vcpu_guest_other_state { + TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100, +}; + +#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0) + +static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details) +{ + return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING); +} + +/* @field is any of enum tdx_tdcs_execution_control */ +#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field)) + +/* @field is the VMCS field encoding */ +#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field)) + +/* @field is any of enum tdx_guest_other_state */ +#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field)) +#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field)) + +/* Management class fields */ +enum tdx_vcpu_guest_management { + TD_VCPU_PEND_NMI = 11, +}; + +/* @field is any of enum tdx_vcpu_guest_management */ +#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field)) + +#define TDX_EXTENDMR_CHUNKSIZE 256 + +struct tdx_cpuid_value { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +} __packed; + +#define TDX_TD_ATTR_DEBUG BIT_ULL(0) +#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28) +#define TDX_TD_ATTR_PKS BIT_ULL(30) +#define TDX_TD_ATTR_KL BIT_ULL(31) +#define TDX_TD_ATTR_PERFMON BIT_ULL(63) + +#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0) +#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6 +/* + * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. + */ +struct td_params { + u64 attributes; + u64 xfam; + u16 max_vcpus; + u8 reserved0[6]; + + u64 eptp_controls; + u64 config_flags; + u16 tsc_frequency; + u8 reserved1[38]; + + u64 mrconfigid[6]; + u64 mrowner[6]; + u64 mrownerconfig[6]; + u64 reserved2[4]; + + union { + DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values); + u8 reserved3[768]; + }; +} __packed __aligned(1024); + +/* + * Guest uses MAX_PA for GPAW when set. + * 0: GPA.SHARED bit is GPA[47] + * 1: GPA.SHARED bit is GPA[51] + */ +#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0) + +/* + * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP + * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered. + * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved. 
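 *
 * Worked examples of the field-ID composition defined earlier in this header
 * (class shifted to bit 56 via TDX_CLASS_SHIFT, field in bits 31:0, bit 63
 * flags non-architectural fields):
 *
 *   TDCS_EXEC(TD_TDCS_EXEC_TSC_OFFSET)  = (17ULL << 56) | 10
 *                                        = 0x110000000000000aULL
 *   TDVPS_MANAGEMENT(TD_VCPU_PEND_NMI)  = (32ULL << 56) | 11
 *                                        = 0x200000000000000bULL
 *   TDVPS_STATE_NON_ARCH(TD_VCPU_STATE_DETAILS_NON_ARCH)
 *                                        = BIT_ULL(63) | (17ULL << 56) | 0x100
 *                                        = 0x9100000000000100ULL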
+ */ +#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2) + + +/* + * TDX requires the frequency to be defined in units of 25MHz, which is the + * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX + * module can only program frequencies that are multiples of 25MHz. The + * frequency must be between 100mhz and 10ghz (inclusive). + */ +#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000)) +#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000)) +#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) +#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) + +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + +#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) + +/* + * TD scope metadata field ID. + */ +#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL + +#endif /* __KVM_X86_TDX_ARCH_H */ diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h new file mode 100644 index 000000000000..6ff4672c4181 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_errno.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural status code for SEAMCALL */ + +#ifndef __KVM_X86_TDX_ERRNO_H +#define __KVM_X86_TDX_ERRNO_H + +#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL + +/* + * TDX SEAMCALL Status Codes (returned in RAX) + */ +#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL +#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL +#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL +#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL +#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL +#define TDX_OPERAND_INVALID 0xC000010000000000ULL +#define TDX_OPERAND_BUSY 0x8000020000000000ULL +#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL +#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL +#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL +#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL +#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL +#define TDX_KEY_CONFIGURED 0x0000081500000000ULL +#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL +#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL +#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL +#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL +#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL + +/* + * TDX module operand ID, appears in 31:0 part of error code as + * detail information + */ +#define TDX_OPERAND_ID_RCX 0x01 +#define TDX_OPERAND_ID_TDR 0x80 +#define TDX_OPERAND_ID_SEPT 0x92 +#define TDX_OPERAND_ID_TD_EPOCH 0xa9 + +#endif /* __KVM_X86_TDX_ERRNO_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index f6986dee6f8c..0a6cf5bff2aa 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -59,8 +59,7 @@ * without the explicit restore, thinks the stack is getting walloped. * Using an unwind hint is problematic due to x86-64's dynamic alignment. 
*/ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP + leave RET .endm diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 157c23db22be..aa157fe5b7b3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -54,6 +54,7 @@ #include <trace/events/ipi.h> #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -74,6 +75,8 @@ #include "vmx_onhyperv.h" #include "posted_intr.h" +#include "mmu/spte.h" + MODULE_AUTHOR("Qumranet"); MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); MODULE_LICENSE("GPL"); @@ -112,10 +115,10 @@ static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); module_param(enable_apicv, bool, 0444); - -bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -165,31 +168,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); RTIT_STATUS_BYTECNT)) /* - * List of MSRs that can be directly passed to the guest. - * In addition to these x2apic, PT and LBR MSRs are handled specially. - */ -static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { - MSR_IA32_SPEC_CTRL, - MSR_IA32_PRED_CMD, - MSR_IA32_FLUSH_CMD, - MSR_IA32_TSC, -#ifdef CONFIG_X86_64 - MSR_FS_BASE, - MSR_GS_BASE, - MSR_KERNEL_GS_BASE, - MSR_IA32_XFD, - MSR_IA32_XFD_ERR, -#endif - MSR_IA32_SYSENTER_CS, - MSR_IA32_SYSENTER_ESP, - MSR_IA32_SYSENTER_EIP, - MSR_CORE_C1_RES, - MSR_CORE_C3_RESIDENCY, - MSR_CORE_C6_RESIDENCY, - MSR_CORE_C7_RESIDENCY, -}; - -/* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. @@ -671,40 +649,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static int vmx_get_passthrough_msr_slot(u32 msr) -{ - int i; - - switch (msr) { - case 0x800 ... 0x8ff: - /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ - return -ENOENT; - case MSR_IA32_RTIT_STATUS: - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - case MSR_IA32_RTIT_CR3_MATCH: - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ - case MSR_LBR_SELECT: - case MSR_LBR_TOS: - case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: - case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: - case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: - case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: - case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: - /* LBR MSRs. 
These are handled in vmx_update_intercept_for_lbr_msrs() */ - return -ENOENT; - } - - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { - if (vmx_possible_passthrough_msrs[i] == msr) - return i; - } - - WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); - return -ENOENT; -} - struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -771,8 +715,11 @@ void vmx_emergency_disable_virtualization_cpu(void) return; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) + loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); + if (v->shadow_vmcs) + vmcs_clear(v->shadow_vmcs); + } kvm_cpu_vmxoff(); } @@ -957,6 +904,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; + if (static_branch_unlikely(&cpu_buf_vm_clear) && + kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) + flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; + return flags; } @@ -1283,6 +1234,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); @@ -1311,7 +1263,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (vmx->guest_state_loaded) + if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1332,12 +1284,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -1349,14 +1301,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - vmx->guest_state_loaded = true; + vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->guest_state_loaded) + if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1384,10 +1336,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); - vmx->guest_state_loaded = false; + vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } @@ -1395,7 +1347,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->guest_state_loaded) + if (vmx->vt.guest_state_loaded) rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; @@ -1404,7 +1356,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { 
preempt_disable(); - if (vmx->guest_state_loaded) + if (vmx->vt.guest_state_loaded) wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; @@ -1443,8 +1395,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1471,17 +1422,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - - /* - * No indirect branch prediction barrier needed when switching - * the active VMCS within a vCPU, unless IBRS is advertised to - * the vCPU. To minimize the number of IBPBs executed, KVM - * performs IBPB on nested VM-Exit (a single nested transition - * may switch the active VMCS multiple times). - */ - if (static_branch_likely(&switch_vcpu_ibpb) && - (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) - indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -1520,7 +1460,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); } @@ -1581,7 +1521,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } bool vmx_get_if_flag(struct kvm_vcpu *vcpu) @@ -1701,7 +1641,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). 
*/ - if (to_vmx(vcpu)->exit_reason.enclave_mode) { + if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } @@ -1716,7 +1656,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; @@ -1863,7 +1803,7 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) return; } - WARN_ON_ONCE(vmx->emulation_required); + WARN_ON_ONCE(vmx->vt.emulation_required); if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -2154,7 +2094,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + msr_info->data = vmx_guest_debugctl_read(); break; default: find_uret_msr: @@ -2179,7 +2119,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } -static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) +u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) { u64 debugctl = 0; @@ -2191,9 +2131,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (boot_cpu_has(X86_FEATURE_RTM) && + (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) + debugctl |= DEBUGCTLMSR_RTM_DEBUG; + return debugctl; } +bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) +{ + u64 invalid; + + invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); + if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); + } + return !invalid; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. 
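The new vmx_get_supported_debugctl()/vmx_is_valid_debugctl() split above makes the write policy explicit: unsupported BTF/LBR bits are warned about and tolerated (the caller then masks them off), while any other unsupported bit fails the write. A small sketch of that filtering decision; the bit positions here are placeholders, not the real MSR definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions only; the real values live in the kernel headers. */
#define DBGCTL_LBR	(1ULL << 0)
#define DBGCTL_BTF	(1ULL << 1)
#define DBGCTL_OTHER	(1ULL << 14)

/* Mirrors the shape of vmx_is_valid_debugctl(): tolerate BTF/LBR, reject the rest. */
static bool debugctl_write_ok(uint64_t data, uint64_t supported)
{
	uint64_t invalid = data & ~supported;

	if (invalid & (DBGCTL_BTF | DBGCTL_LBR)) {
		fprintf(stderr, "ignoring unsupported BTF/LBR bits\n");
		invalid &= ~(DBGCTL_BTF | DBGCTL_LBR);
	}
	return !invalid;
}

int main(void)
{
	uint64_t supported = 0;	/* pretend the guest supports neither LBR nor BTF */

	/* Accepted: only tolerated bits are unsupported; the caller masks them away. */
	printf("%d\n", debugctl_write_ok(DBGCTL_LBR, supported));	/* 1 */
	/* Rejected: an unsupported bit outside the tolerated set. */
	printf("%d\n", debugctl_write_ok(DBGCTL_OTHER, supported));	/* 0 */
	return 0;
}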
@@ -2262,29 +2218,22 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_DEBUGCTLMSR: { - u64 invalid; - - invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); - if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); - data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); - invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); - } - - if (invalid) + case MSR_IA32_DEBUGCTLMSR: + if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) return 1; + data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; - vmcs_write64(GUEST_IA32_DEBUGCTL, data); + vmx_guest_debugctl_write(vcpu, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); return 0; - } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -3406,7 +3355,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_ept_level(void) @@ -3669,7 +3618,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); } void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4018,76 +3967,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) vmx->nested.force_msr_bitmap_recalc = true; } -void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - int idx; if (!cpu_has_vmx_msr_bitmap()) return; vmx_msr_bitmap_l01_changed(vmx); - /* - * Mark the desired intercept state in shadow bitmap, this is needed - * for resync when the MSR filters change. 
- */ - idx = vmx_get_passthrough_msr_slot(msr); - if (idx >= 0) { - if (type & MSR_TYPE_R) - clear_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - clear_bit(idx, vmx->shadow_msr_intercept.write); - } - - if ((type & MSR_TYPE_R) && - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { - vmx_set_msr_bitmap_read(msr_bitmap, msr); - type &= ~MSR_TYPE_R; - } - - if ((type & MSR_TYPE_W) && - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { - vmx_set_msr_bitmap_write(msr_bitmap, msr); - type &= ~MSR_TYPE_W; + if (type & MSR_TYPE_R) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); + else + vmx_set_msr_bitmap_read(msr_bitmap, msr); } - if (type & MSR_TYPE_R) - vmx_clear_msr_bitmap_read(msr_bitmap, msr); - - if (type & MSR_TYPE_W) - vmx_clear_msr_bitmap_write(msr_bitmap, msr); -} - -void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - int idx; - - if (!cpu_has_vmx_msr_bitmap()) - return; - - vmx_msr_bitmap_l01_changed(vmx); - - /* - * Mark the desired intercept state in shadow bitmap, this is needed - * for resync when the MSR filter changes. - */ - idx = vmx_get_passthrough_msr_slot(msr); - if (idx >= 0) { - if (type & MSR_TYPE_R) - set_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - set_bit(idx, vmx->shadow_msr_intercept.write); + if (type & MSR_TYPE_W) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); + else + vmx_set_msr_bitmap_write(msr_bitmap, msr); } - - if (type & MSR_TYPE_R) - vmx_set_msr_bitmap_read(msr_bitmap, msr); - - if (type & MSR_TYPE_W) - vmx_set_msr_bitmap_write(msr_bitmap, msr); } static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) @@ -4166,79 +4068,57 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 i; - if (!cpu_has_vmx_msr_bitmap()) return; - /* - * Redo intercept permissions for MSRs that KVM is passing through to - * the guest. Disabling interception will check the new MSR filter and - * ensure that KVM enables interception if usersepace wants to filter - * the MSR. MSRs that KVM is already intercepting don't need to be - * refreshed since KVM is going to intercept them regardless of what - * userspace wants. 
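The unified vmx_set_intercept_for_msr() above replaces the old enable/disable pair: a request to stop intercepting an MSR is honored only if the userspace MSR filter also allows the access, otherwise the intercept stays set. The decision per bitmap bit reduces to the following truth table (function and parameter names are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/*
 * set == true  -> KVM wants to intercept the MSR.
 * set == false -> KVM would like to pass it through, but only if the
 *                 userspace filter permits the guest access.
 */
static bool msr_bitmap_bit(bool set, bool filter_allows)
{
	if (!set && filter_allows)
		return false;	/* clear the bitmap bit: no VM-exit */
	return true;		/* keep intercepting */
}

int main(void)
{
	printf("pass-through requested, filter allows -> intercept=%d\n",
	       msr_bitmap_bit(false, true));	/* 0 */
	printf("pass-through requested, filter denies -> intercept=%d\n",
	       msr_bitmap_bit(false, false));	/* 1 */
	printf("intercept requested                   -> intercept=%d\n",
	       msr_bitmap_bit(true, true));	/* 1 */
	return 0;
}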
- */ - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { - u32 msr = vmx_possible_passthrough_msrs[i]; - - if (!test_bit(i, vmx->shadow_msr_intercept.read)) - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); - - if (!test_bit(i, vmx->shadow_msr_intercept.write)) - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); +#ifdef CONFIG_X86_64 + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(vcpu->kvm)) { + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); } /* PT MSRs can be passed through iff PT is exposed to the guest. */ if (vmx_pt_mode_is_host_guest()) pt_update_intercept_for_msr(vcpu); -} -static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - int pi_vec) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of the virtual has already been set in the PIR. - * Send a notification event to deliver the virtual interrupt - * unless the vCPU is the currently running vCPU, i.e. the - * event is being sent from a fastpath VM-Exit handler, in - * which case the PIR will be synced to the vIRR before - * re-entering the guest. - * - * When the target is not the running vCPU, the following - * possibilities emerge: - * - * Case 1: vCPU stays in non-root mode. Sending a notification - * event posts the interrupt to the vCPU. - * - * Case 2: vCPU exits to root mode and is still runnable. The - * PIR will be synced to the vIRR before re-entering the guest. - * Sending a notification event is ok as the host IRQ handler - * will ignore the spurious event. - * - * Case 3: vCPU exits to root mode and is blocked. vcpu_block() - * has already synced PIR to vIRR and never blocks the vCPU if - * the vIRR is not empty. Therefore, a blocked vCPU here does - * not wait for any requested interrupts in PIR, and sending a - * notification event also results in a benign, spurious event. 
- */ + if (vcpu->arch.xfd_no_write_intercept) + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !to_vmx(vcpu)->spec_ctrl); + + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); + + if (cpu_feature_enabled(X86_FEATURE_IBPB)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); + + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - if (vcpu != kvm_get_running_vcpu()) - __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return; - } -#endif /* - * The vCPU isn't in the guest; wake the vCPU in case it is blocking, - * otherwise do nothing as KVM will grab the highest priority pending - * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + * x2APIC and LBR MSR intercepts are modified on-demand and cannot be + * filtered by userspace. */ - kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4289,7 +4169,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); @@ -4300,20 +4180,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!vcpu->arch.apic->apicv_active) return -1; - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return 0; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return 0; - - /* - * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() - * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is - * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a - * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. - */ - kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); + __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); return 0; } @@ -4780,7 +4647,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -4852,7 +4719,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write32(GUEST_SYSENTER_CS, 0); vmcs_writel(GUEST_SYSENTER_ESP, 0); vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + vmx_guest_debugctl_write(&vmx->vcpu, 0); if (cpu_has_vmx_tpr_shadow()) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); @@ -4893,8 +4761,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. 
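The delivery path above now funnels into the shared __vmx_deliver_posted_interrupt(), but the removed comments still describe the underlying protocol: publish the vector in the PIR first, then send a notification only if this poster is the one that turned the outstanding-notification bit on. A loose userspace illustration of that ordering, using C11 atomics and invented names rather than KVM's pi_desc helpers:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented, simplified stand-in for a posted-interrupt descriptor. */
struct pi_sketch {
	atomic_ullong pir[4];	/* 256 posted-interrupt request bits */
	atomic_int on;		/* outstanding-notification flag */
};

/* Returns true if the caller still owes the target a notification event. */
static bool post_vector(struct pi_sketch *pi, unsigned int vector)
{
	unsigned long long mask = 1ULL << (vector % 64);

	/* Publish the vector first, so whoever handles the notification sees it. */
	if (atomic_fetch_or(&pi->pir[(vector % 256) / 64], mask) & mask)
		return false;	/* vector already pending; the earlier poster notifies */

	/* Only the poster that flips ON from 0 to 1 sends the notification. */
	return atomic_exchange(&pi->on, 1) == 0;
}

int main(void)
{
	struct pi_sketch pi = { 0 };

	printf("first post notifies: %d\n", post_vector(&pi, 32));	/* 1 */
	printf("second post skips:   %d\n", post_vector(&pi, 48));	/* 0 */
	return 0;
}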
*/ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - __pi_set_sn(&vmx->pi_desc); + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); } void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -5668,12 +5536,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } -void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) -{ - lockdep_assert_irqs_disabled(); - set_debugreg(vcpu->arch.dr6, 6); -} - void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5811,11 +5673,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5831,23 +5690,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) - ? PFERR_PRESENT_MASK : 0; - - if (error_code & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because @@ -5859,7 +5701,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) @@ -5904,7 +5746,7 @@ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->emulation_required) + if (!vmx->vt.emulation_required) return false; /* @@ -5936,7 +5778,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; - while (vmx->emulation_required && count-- != 0) { + while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -6131,7 +5973,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). 
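handle_ept_violation() above no longer open-codes the page-fault error-code derivation; it hands the raw exit qualification to __vmx_handle_ept_violation() so the same decoding can be shared. The removed mapping itself is mechanical, roughly as sketched below; the bit values are placeholders, the real ones come from the VMX and PFERR headers:

#include <stdint.h>
#include <stdio.h>

/* Placeholder bit layout for the EPT-violation exit qualification. */
#define EPTV_READ		(1ULL << 0)
#define EPTV_WRITE		(1ULL << 1)
#define EPTV_INSTR		(1ULL << 2)
#define EPTV_PROT		(1ULL << 3)	/* some permission bit set in the PTE */
#define EPTV_GVA_VALID		(1ULL << 7)
#define EPTV_GVA_TRANSLATED	(1ULL << 8)

/* Placeholder page-fault error-code flags. */
#define PF_USER		(1ULL << 2)
#define PF_WRITE	(1ULL << 1)
#define PF_FETCH	(1ULL << 4)
#define PF_PRESENT	(1ULL << 0)
#define PF_GUEST_FINAL	(1ULL << 32)
#define PF_GUEST_PAGE	(1ULL << 33)

static uint64_t ept_qual_to_error_code(uint64_t qual)
{
	uint64_t ec = 0;

	ec |= (qual & EPTV_READ)  ? PF_USER    : 0;
	ec |= (qual & EPTV_WRITE) ? PF_WRITE   : 0;
	ec |= (qual & EPTV_INSTR) ? PF_FETCH   : 0;
	ec |= (qual & EPTV_PROT)  ? PF_PRESENT : 0;
	if (qual & EPTV_GVA_VALID)
		ec |= (qual & EPTV_GVA_TRANSLATED) ? PF_GUEST_FINAL : PF_GUEST_PAGE;
	return ec;
}

int main(void)
{
	/* e.g. a write to a present page, fault on the final translation */
	printf("0x%llx\n", (unsigned long long)
	       ept_qual_to_error_code(EPTV_WRITE | EPTV_PROT | EPTV_GVA_VALID |
				      EPTV_GVA_TRANSLATED));
	return 0;
}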
*/ - to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } @@ -6229,9 +6071,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); - *reason = vmx->exit_reason.full; + *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason.failed_vmentry)) { + if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -6527,7 +6369,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; @@ -6583,7 +6425,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * the least awful solution for the userspace case without * risking false positives. */ - if (vmx->emulation_required) { + if (vmx->vt.emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } @@ -6593,7 +6435,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } /* If guest state is invalid, start emulating. L2 is handled above. */ - if (vmx->emulation_required) + if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { @@ -6693,7 +6535,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. */ - if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; @@ -6972,22 +6814,22 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); + if (pi_test_on(&vt->pi_desc)) { + pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. 
*/ smp_mb__after_atomic(); got_posted_interrupt = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; @@ -7027,14 +6869,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -7091,14 +6925,12 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->emulation_required) + if (to_vt(vcpu)->emulation_required) return; - if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } @@ -7333,10 +7165,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, * the fastpath even, all other exits must use the slow path. */ if (is_guest_mode(vcpu) && - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; - switch (to_vmx(vcpu)->exit_reason.basic) { + switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -7348,6 +7180,20 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, } } +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) +{ + if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || + !is_nmi(vmx_get_intr_info(vcpu))) + return; + + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); +} + static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned int flags) { @@ -7368,8 +7214,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&cpu_buf_vm_clear) && - kvm_arch_has_assigned_device(vcpu->kvm)) - mds_clear_cpu_buffers(); + (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) + x86_clear_cpu_buffers(); vmx_disable_fb_clear(vmx); @@ -7387,30 +7233,23 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + vmx->vt.exit_reason.full = 0xdead; goto out; } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); - if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && - is_nmi(vmx_get_intr_info(vcpu))) { - kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - 
fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); - kvm_after_interrupt(vcpu); - } + vmx_handle_nmi(vcpu); out: guest_state_exit_irqoff(); } -fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7424,15 +7263,15 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. */ - if (unlikely(vmx->emulation_required)) { + if (unlikely(vmx->vt.emulation_required)) { vmx->fail = 0; - vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; - vmx->exit_reason.failed_vmentry = 1; + vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); - vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); - vmx->exit_intr_info = 0; + vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7455,6 +7294,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) + set_debugreg(vcpu->arch.dr6, 6); + + if (run_flags & KVM_RUN_LOAD_DEBUGCTL) + vmx_reload_guest_debugctl(vcpu); + /* * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time @@ -7535,7 +7380,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * checking. 
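vmx_vcpu_run() above now takes a u64 run_flags instead of a lone boolean, so common x86 code can request several per-run actions (immediate exit, reloading DR6, reloading DEBUGCTL) in one argument. The pattern is plain bit-testing of a flags word; a trivial sketch with invented bit positions standing in for the KVM_RUN_* flags:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-ins for the KVM_RUN_* run-flag bits. */
#define RUN_FORCE_IMMEDIATE_EXIT	(1ULL << 0)
#define RUN_LOAD_GUEST_DR6		(1ULL << 1)
#define RUN_LOAD_DEBUGCTL		(1ULL << 2)

static void vcpu_run_sketch(uint64_t run_flags)
{
	bool force_immediate_exit = run_flags & RUN_FORCE_IMMEDIATE_EXIT;

	if (run_flags & RUN_LOAD_GUEST_DR6)
		puts("reload guest DR6 before entry");
	if (run_flags & RUN_LOAD_DEBUGCTL)
		puts("reload guest IA32_DEBUGCTL before entry");
	if (force_immediate_exit)
		puts("arm an immediate VM-exit");
}

int main(void)
{
	vcpu_run_sketch(RUN_LOAD_GUEST_DR6 | RUN_FORCE_IMMEDIATE_EXIT);
	return 0;
}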
*/ if (vmx->nested.nested_run_pending && - !vmx->exit_reason.failed_vmentry) + !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; @@ -7544,12 +7389,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason.failed_vmentry)) + if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -7581,7 +7426,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - INIT_LIST_HEAD(&vmx->pi_wakeup_list); + INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; @@ -7629,26 +7474,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) evmcs->hv_enlightenments_control.msr_bitmap = 1; } - /* The MSR bitmap starts with all ones */ - bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); - bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); - - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); -#ifdef CONFIG_X86_64 - vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); -#endif - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - if (kvm_cstate_in_guest(vcpu->kvm)) { - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); - } - vmx->loaded_vmcs = &vmx->vmcs01; if (cpu_need_virtualize_apic_accesses(vcpu)) { @@ -7679,7 +7504,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], - __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; @@ -7698,7 +7523,7 @@ free_vpid: int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) - kvm->arch.pause_in_guest = true; + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { switch (l1tf_mitigation) { @@ -7724,9 +7549,23 @@ int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } +static inline bool vmx_ignore_guest_pat(struct kvm *kvm) +{ + /* + * Non-coherent DMA devices need the guest to flush CPU properly. + * In that case it is not possible to map all guest RAM as WB, so + * always trust guest PAT. + */ + return !kvm_arch_has_noncoherent_dma(kvm) && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); +} + u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* @@ -7736,13 +7575,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - /* - * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached. 
Letting the guest control memory types on Intel - * CPUs may result in unexpected behavior, and so KVM's ABI is to trust - * the guest to behave only as a last resort. - */ - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* Force WB if ignoring guest PAT */ + if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); @@ -7926,18 +7760,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } - if (kvm_cpu_cap_has(X86_FEATURE_XFD)) - vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); - - if (boot_cpu_has(X86_FEATURE_IBPB)) - vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, - !guest_has_pred_cmd_msr(vcpu)); - - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) - vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, - !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); @@ -7953,6 +7775,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; + /* Recalc MSR interception to account for feature changes. */ + vmx_recalc_msr_intercepts(vcpu); + /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } @@ -8604,6 +8429,8 @@ __init int vmx_hardware_setup(void) if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + else + vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID @@ -8621,9 +8448,6 @@ __init int vmx_hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vt_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8681,6 +8505,27 @@ __init int vmx_hardware_setup(void) kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + /* + * On Intel CPUs that lack self-snoop feature, letting the guest control + * memory types may result in unexpected behavior. So always ignore guest + * PAT on those CPUs and map VM as writeback, not allowing userspace to + * disable the quirk. + * + * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is + * supported, UC is slow enough to cause issues with some older guests (e.g. + * an old version of bochs driver uses ioremap() instead of ioremap_wc() to + * map the video RAM, causing wayland desktop to fail to get started + * correctly). To avoid breaking those older guests that rely on KVM to force + * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the + * safer (for performance) default behavior. + * + * On top of this, non-coherent DMA devices need the guest to flush CPU + * caches properly. This also requires honoring guest PAT, and is forced + * independent of the quirk in vmx_ignore_guest_pat(). 
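Taken together, vmx_get_mt_mask() and the new vmx_ignore_guest_pat() above boil down to a three-way choice: MMIO is always UC; otherwise force WB and set the ignore-PAT bit unless the VM has non-coherent DMA or userspace disabled the quirk, in which case guest PAT is honored. A compact restatement of that decision with placeholder encodings (the real shift and IPAT bit come from the VMX headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MT_UC		0ULL
#define MT_WB		6ULL
#define MT_SHIFT	3
#define IPAT_BIT	(1ULL << 6)

static uint64_t ept_memtype(bool is_mmio, bool has_noncoherent_dma, bool quirk_enabled)
{
	if (is_mmio)
		return MT_UC << MT_SHIFT;

	/* Guest PAT must be honored when non-coherent DMA needs real cache control. */
	bool ignore_guest_pat = !has_noncoherent_dma && quirk_enabled;

	if (ignore_guest_pat)
		return (MT_WB << MT_SHIFT) | IPAT_BIT;

	return MT_WB << MT_SHIFT;	/* WB, but guest PAT still applies */
}

int main(void)
{
	printf("VM with non-coherent device -> 0x%llx\n",
	       (unsigned long long)ept_memtype(false, true, true));
	printf("plain VM, quirk enabled     -> 0x%llx\n",
	       (unsigned long long)ept_memtype(false, false, true));
	return 0;
}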
+ */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } @@ -8694,26 +8539,21 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void __vmx_exit(void) +void vmx_exit(void) { allow_smaller_maxphyaddr = false; vmx_cleanup_l1d_flush(); -} -static void __exit vmx_exit(void) -{ - kvm_exit(); - __vmx_exit(); kvm_x86_vendor_exit(); - } -module_exit(vmx_exit); -static int __init vmx_init(void) +int __init vmx_init(void) { int r, cpu; + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); + if (!kvm_is_vmx_supported()) return -EOPNOTSUPP; @@ -8754,21 +8594,9 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; - /* - * Common KVM initialization _must_ come last, after this, /dev/kvm is - * exposed to userspace! - */ - r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), - THIS_MODULE); - if (r) - goto err_kvm_init; - return 0; -err_kvm_init: - __vmx_exit(); err_l1d_flush: kvm_x86_vendor_exit(); return r; } -module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 951e44dc9d0e..d3389baf3ab3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -11,13 +11,13 @@ #include "capabilities.h" #include "../kvm_cache_regs.h" +#include "pmu_intel.h" #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" #include "../mmu.h" - -#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) +#include "common.h" #ifdef CONFIG_X86_64 #define MAX_NR_USER_RETURN_MSRS 7 @@ -67,47 +67,6 @@ struct pt_desc { struct pt_ctx guest; }; -union vmx_exit_reason { - struct { - u32 basic : 16; - u32 reserved16 : 1; - u32 reserved17 : 1; - u32 reserved18 : 1; - u32 reserved19 : 1; - u32 reserved20 : 1; - u32 reserved21 : 1; - u32 reserved22 : 1; - u32 reserved23 : 1; - u32 reserved24 : 1; - u32 reserved25 : 1; - u32 bus_lock_detected : 1; - u32 enclave_mode : 1; - u32 smi_pending_mtf : 1; - u32 smi_from_vmx_root : 1; - u32 reserved30 : 1; - u32 failed_vmentry : 1; - }; - u32 full; -}; - -struct lbr_desc { - /* Basic info about guest LBR records. */ - struct x86_pmu_lbr records; - - /* - * Emulate LBR feature via passthrough LBR registers when the - * per-vcpu guest LBR event is scheduled on the current pcpu. - * - * The records may be inaccurate if the host reclaims the LBR. - */ - struct perf_event *event; - - /* True if LBRs are marked as not intercepted in the MSR bitmap */ - bool msr_passthrough; -}; - -extern struct x86_pmu_lbr vmx_lbr_caps; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -248,20 +207,10 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct vcpu_vt vt; u8 fail; u8 x2apic_msr_bitmap_mode; - /* - * If true, host state has been stored in vmx->loaded_vmcs for - * the CPU registers that only need to be switched when transitioning - * to/from the kernel, and the registers have been loaded with guest - * values. If false, host state is loaded in the CPU registers - * and vmx->loaded_vmcs->host_state is invalid. 
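The union vmx_exit_reason definition leaves vmx.h in this hunk (the accessors now read it through vcpu_vt), but its layout is unchanged: the low 16 bits carry the basic exit reason, bit 26 flags a detected bus lock, bit 27 enclave mode, and bit 31 a failed VM-entry. A small decode example with the same layout written as shifts and masks instead of a bitfield (the macro names are made up; 33 is the invalid-guest-state basic reason):

#include <stdint.h>
#include <stdio.h>

#define EXIT_REASON_BASIC(full)		((full) & 0xffffu)
#define EXIT_REASON_BUS_LOCK(full)	(((full) >> 26) & 1u)
#define EXIT_REASON_ENCLAVE(full)	(((full) >> 27) & 1u)
#define EXIT_REASON_FAILED_ENTRY(full)	(((full) >> 31) & 1u)

int main(void)
{
	/* A failed VM-entry due to invalid guest state: basic reason 33, bit 31 set. */
	uint32_t full = (1u << 31) | 33;

	printf("basic=%u failed_vmentry=%u\n",
	       (unsigned)EXIT_REASON_BASIC(full),
	       (unsigned)EXIT_REASON_FAILED_ENTRY(full));
	return 0;
}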
- */ - bool guest_state_loaded; - - unsigned long exit_qualification; - u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -274,7 +223,6 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif @@ -313,15 +261,6 @@ struct vcpu_vmx { } seg[8]; } segment_cache; int vpid; - bool emulation_required; - - union vmx_exit_reason exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Used if this vCPU is waiting for PI notification wakeup. */ - struct list_head pi_wakeup_list; /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -355,13 +294,6 @@ struct vcpu_vmx { struct pt_desc pt_desc; struct lbr_desc lbr_desc; - /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16 - struct { - DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); - DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); - } shadow_msr_intercept; - /* ve_info must be page aligned. */ struct vmx_ve_information *ve_info; }; @@ -376,8 +308,44 @@ struct kvm_vmx { u64 *pid_table; }; -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy); +static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) +{ + return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); +} + +static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt) +{ + return &(container_of(vt, struct vcpu_vmx, vt)->vcpu); +} + +static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu) +{ + return to_vt(vcpu)->exit_reason; +} + +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + return vt->exit_qualification; +} + +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + return vt->exit_intr_info; +} + +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); @@ -418,24 +386,54 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); -void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); -void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); + +static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + vmx_set_intercept_for_msr(vcpu, msr, type, false); +} + +static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type) +{ + vmx_set_intercept_for_msr(vcpu, msr, type, true); +} u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); -static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, - int type, bool value) +void 
vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + +u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); +bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); + +#define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM) + +static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) +{ + WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS); + + val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS; + vmcs_write64(GUEST_IA32_DEBUGCTL, val); +} + +static inline u64 vmx_guest_debugctl_read(void) { - if (value) - vmx_enable_intercept_for_msr(vcpu, msr, type); - else - vmx_disable_intercept_for_msr(vcpu, msr, type); + return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS; } -void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) +{ + u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); + + if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS)) + return; + + vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS); +} /* * Note, early Intel manuals have the write-low and read-high bitmap offsets @@ -662,45 +660,10 @@ static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) -{ - return &to_vmx(vcpu)->lbr_desc; -} - -static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) -{ - return &vcpu_to_lbr_desc(vcpu)->records; -} - -static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) -{ - return !!vcpu_to_lbr_records(vcpu)->nr; -} - void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) - vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - return vmx->exit_qualification; -} - -static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - return vmx->exit_intr_info; -} - struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -758,4 +721,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) vmx->segment_cache.bitmask = 0; } +int vmx_init(void); +void vmx_exit(void); + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 430773a5ef8e..2b3424f638db 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm); int vmx_vcpu_precreate(struct kvm *kvm); int vmx_vcpu_create(struct kvm_vcpu *vcpu); int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu); -fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags); void vmx_vcpu_free(struct kvm_vcpu *vcpu); void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); @@ -46,18 +46,18 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, bool 
vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); void vmx_migrate_timers(struct kvm_vcpu *vcpu); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu); void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); -void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); +void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); int vmx_get_feature_msr(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#define vmx_complete_emulated_msr kvm_complete_insn_gp u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -121,4 +121,39 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); #endif void vmx_setup_mce(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_INTEL_TDX +void tdx_disable_virtualization_cpu(void); +int tdx_vm_init(struct kvm *kvm); +void tdx_mmu_release_hkid(struct kvm *kvm); +void tdx_vm_destroy(struct kvm *kvm); +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); + +int tdx_vcpu_create(struct kvm_vcpu *vcpu); +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void tdx_vcpu_free(struct kvm_vcpu *vcpu); +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags); +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void tdx_vcpu_put(struct kvm_vcpu *vcpu); +int tdx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath); + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void tdx_inject_nmi(struct kvm_vcpu *vcpu); +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +bool tdx_has_emulated_msr(u32 index); +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +#endif + #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5bdb5b854924..a1c49bc681c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,7 +90,6 @@ #include "trace.h" #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 /* * Note, kvm_caps fields should *never* have default values, all fields must be @@ -227,6 +226,12 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); +bool __read_mostly enable_ipiv = true; +EXPORT_SYMBOL_GPL(enable_ipiv); + +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_GPL(enable_device_posted_irqs); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, 
mmu_shadow_zapped), @@ -636,6 +641,15 @@ static void kvm_user_return_msr_cpu_online(void) } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -649,15 +663,20 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + + msrs->values[slot].curr = value; + kvm_user_return_register_notifier(msrs); +} +EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); + static void drop_user_return_notifiers(void) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -3242,9 +3261,11 @@ int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_caps.has_tsc_control) + if (kvm_caps.has_tsc_control) { tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); + tgt_tsc_khz = tgt_tsc_khz ? : 1; + } if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, @@ -4561,6 +4582,9 @@ static u64 kvm_get_allowed_disable_exits(void) { u64 r = KVM_X86_DISABLE_EXITS_PAUSE; + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + r |= KVM_X86_DISABLE_EXITS_APERFMPERF; + if (!mitigate_smt_rsb) { r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_CSTATE; @@ -4616,17 +4640,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXT_CPUID: case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: +#ifdef CONFIG_KVM_IOAPIC case KVM_CAP_PIT: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_REINJECT_CONTROL: +#endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: #ifdef CONFIG_KVM_HYPERV @@ -4739,6 +4766,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; + if (kvm) + r = kvm->max_vcpus; break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_IDS; @@ -4794,7 +4823,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = enable_pmu ? 
KVM_CAP_PMU_VALID_MASK : 0; break; case KVM_CAP_DISABLE_QUIRKS2: - r = KVM_X86_VALID_QUIRKS; + r = kvm_caps.supported_quirks; break; case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; @@ -4965,16 +4994,13 @@ out: return r; } -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -4991,12 +5017,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_x86_call(has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - wbinvd_ipi, NULL, 1); + wbinvd_on_cpu(vcpu->cpu); } kvm_x86_call(vcpu_load)(vcpu, cpu); + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. + */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } + /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -5117,6 +5155,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); @@ -5127,6 +5168,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, { int r; + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + r = kvm_apic_set_state(vcpu, s); if (r) return r; @@ -5448,12 +5492,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; - /* INITs are latched while in SMM */ - if (events->flags & KVM_VCPUEVENT_VALID_SMM && - (events->smi.smm || events->smi.pending) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) - return -EINVAL; - process_nmi(vcpu); /* @@ -6147,6 +6185,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; + + if (vcpu->arch.guest_tsc_protected) + goto out; + user_tsc_khz = (u32)arg; if (kvm_caps.has_tsc_control && @@ -6304,6 +6346,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_DEVICE_ATTR: r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); break; + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; } @@ -6350,135 +6398,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, &pic->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, &pic->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - return r; -} - -static int 
kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic->lock); - memcpy(&pic->pics[0], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic->lock); - memcpy(&pic->pics[1], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_set_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic); - return r; -} - -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; - - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); - - mutex_lock(&kps->lock); - memcpy(ps, &kps->channels, sizeof(*ps)); - mutex_unlock(&kps->lock); - return 0; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int i; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); - return 0; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int start = 0; - int i; - u32 prev_legacy, cur_legacy; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&pit->pit_state.channels, &ps->channels, - sizeof(pit->pit_state.channels)); - pit->pit_state.flags = ps->flags; - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, - start && i == 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - /* pit->pit_state.lock was overloaded to prevent userspace from getting - * an inconsistent state after running multiple KVM_REINJECT_CONTROL - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. 
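The per-CPU last_vcpu check added to kvm_arch_vcpu_load() further up implements a simple "flush when the owner changes" pattern: an IBPB is issued only when a different vCPU is loaded onto the same physical CPU. A toy single-CPU sketch of that pattern (names invented, not kernel code):

#include <stdio.h>

struct vcpu { int id; };

static struct vcpu *last_owner;	/* one slot per physical CPU in the real code */

static void load_vcpu(struct vcpu *v)
{
	if (v != last_owner) {
		puts("issue IBPB: new prediction domain on this CPU");
		last_owner = v;
	}
}

int main(void)
{
	struct vcpu a = { 1 }, b = { 2 };

	load_vcpu(&a);	/* flush */
	load_vcpu(&a);	/* no flush: same vCPU as last time */
	load_vcpu(&b);	/* flush */
	return 0;
}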
- */ - mutex_lock(&pit->pit_state.lock); - kvm_pit_set_reinject(pit, control->pit_reinject); - mutex_unlock(&pit->pit_state.lock); - - return 0; -} - void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -6491,25 +6410,13 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, - bool line_status) -{ - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level, - line_status); - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -6521,11 +6428,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS2: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + if (cap->args[0] & ~kvm_caps.supported_quirks) break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -6574,17 +6481,11 @@ split_irqchip_unlock: if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && - (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_APERFMPERF))) pr_warn_once(SMT_RSB_MSG); - if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) - kvm->arch.pause_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; + kvm_disable_exits(kvm, cap->args[0]); r = 0; disable_exits_unlock: mutex_unlock(&kvm->lock); @@ -7021,9 +6922,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -ENOTTY; + +#ifdef CONFIG_KVM_IOAPIC /* * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be + * that these three variables' stack usage should be * combined, not added together. 
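The KVM_CAP_DISABLE_QUIRKS2 hunks above make both the advertised mask and the accepted bits come from kvm_caps.supported_quirks, and accumulate the result into kvm->arch.disabled_quirks. A minimal userspace sketch of the intended flow, assuming vm_fd is an open VM file descriptor and with error handling trimmed:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void disable_quirks(int vm_fd, uint64_t wanted)
{
	/* KVM_CHECK_EXTENSION reports which quirk bits may be disabled. */
	int r = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DISABLE_QUIRKS2);
	struct kvm_enable_cap cap = { .cap = KVM_CAP_DISABLE_QUIRKS2 };

	if (r <= 0)
		return;

	/* Bits outside the supported mask would be rejected with -EINVAL. */
	cap.args[0] = wanted & (uint64_t)r;
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}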
*/ union { @@ -7031,6 +6934,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm_pit_state2 ps2; struct kvm_pit_config pit_config; } u; +#endif switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -7054,6 +6958,7 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; +#ifdef CONFIG_KVM_IOAPIC case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -7075,7 +6980,7 @@ set_identity_unlock: goto create_irqchip_unlock; } - r = kvm_setup_default_irq_routing(kvm); + r = kvm_setup_default_ioapic_and_pic_routing(kvm); if (r) { kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); @@ -7123,7 +7028,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -7147,7 +7052,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); set_irqchip_out: @@ -7220,6 +7125,7 @@ set_pit2_out: r = kvm_vm_ioctl_reinject(kvm, &control); break; } +#endif case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); @@ -7290,23 +7196,25 @@ set_pit2_out: if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; - WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); - r = 0; - + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); + r = 0; + } + mutex_unlock(&kvm->lock); goto out; } case KVM_GET_TSC_KHZ: { r = READ_ONCE(kvm->arch.default_tsc_khz); goto out; } - case KVM_MEMORY_ENCRYPT_OP: { + case KVM_MEMORY_ENCRYPT_OP: r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) goto out; r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; @@ -8000,7 +7908,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; @@ -8245,8 +8153,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); + wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); } else @@ -9338,7 +9245,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -9363,7 +9270,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -9376,7 +9283,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -9405,7 +9312,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = 
kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; @@ -9771,6 +9678,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrq_safe(MSR_EFER, &kvm_host.efer); @@ -9786,6 +9695,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r != 0) goto out_mmu_exit; + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + kvm_ops_update(ops); for_each_online_cpu(cpu) { @@ -9815,6 +9727,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + /* KVM always ignores guest PAT for shadow paging. */ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; @@ -10023,13 +9939,16 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl, +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int (*complete_hypercall)(struct kvm_vcpu *)) { unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); ++vcpu->stat.hypercalls; @@ -10132,9 +10051,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - return __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi, - is_64_bit_hypercall(vcpu), - kvm_x86_call(get_cpl)(vcpu), + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), complete_hypercall_exit); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -10664,13 +10581,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; kvm_x86_call(sync_pir_to_irr)(vcpu); if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); +#ifdef CONFIG_KVM_IOAPIC else if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); +#endif if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -10724,6 +10644,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); fastpath_t exit_fastpath; + u64 run_flags, debug_ctl; bool req_immediate_exit = false; @@ -10871,8 +10792,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + + /* + * Recalc MSR intercepts as userspace may want to intercept + * accesses to MSRs that KVM would otherwise pass through to + * the guest. 
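The ____kvm_emulate_hypercall() rework above has the helper pull the hypercall number and arguments out of the vCPU's GPRs itself instead of taking them as parameters. For reference, a guest-side sketch of that register convention (number in RAX, arguments in RBX/RCX/RDX/RSI, result returned in RAX); a real guest would patch in VMMCALL on AMD parts:

static inline long kvm_hypercall4_sketch(unsigned long nr, unsigned long a0,
					 unsigned long a1, unsigned long a2,
					 unsigned long a3)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3)
		     : "memory");
	return ret;
}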
+ */ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_call(msr_filter_changed)(vcpu); + kvm_x86_call(recalc_msr_intercepts)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) kvm_x86_call(update_cpu_dirty_logging)(vcpu); @@ -10968,8 +10895,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - if (req_immediate_exit) + run_flags = 0; + if (req_immediate_exit) { + run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; kvm_make_request(KVM_REQ_EVENT, vcpu); + } fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -10978,20 +10908,31 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { + set_debugreg(DR7_FIXED_1, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); + run_flags |= KVM_RUN_LOAD_GUEST_DR6; } else if (unlikely(hw_breakpoint_active())) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); } - vcpu->arch.host_debugctl = get_debugctlmsr(); + /* + * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL + * can be modified in IRQ context, e.g. via SMP function calls. Inform + * vendor code if any host-owned bits were changed, e.g. so that the + * value loaded into hardware while running the guest can be updated. + */ + debug_ctl = get_debugctlmsr(); + if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && + !vcpu->arch.guest_state_protected) + run_flags |= KVM_RUN_LOAD_DEBUGCTL; + vcpu->arch.host_debugctl = debug_ctl; guest_timing_enter_irqoff(); @@ -11005,8 +10946,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, - req_immediate_exit); + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -11018,6 +10958,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) break; } + run_flags = 0; + /* Note, VM-Exits that go down the "slow" path are accounted below. 
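The DEBUGCTL hunk above snapshots the MSR after IRQs are disabled and only asks vendor code to reload it when bits owned by the host have changed since the last snapshot. A minimal sketch of that check, with a made-up mask standing in for kvm_x86_ops.HOST_OWNED_DEBUGCTL:

#include <stdbool.h>
#include <stdint.h>

#define HOST_OWNED_DEBUGCTL_BITS	0x3ULL	/* placeholder mask for illustration */

static bool debugctl_needs_reload(uint64_t last_snapshot, uint64_t current_val)
{
	/* Only changes to host-owned bits require touching hardware again. */
	return (last_snapshot ^ current_val) & HOST_OWNED_DEBUGCTL_BITS;
}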
*/ ++vcpu->stat.exits; } @@ -11030,6 +10972,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); @@ -11134,7 +11077,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11143,9 +11086,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11183,10 +11123,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. */ @@ -11320,7 +11262,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) state = KVM_MP_STATE_RUNNABLE; kvm_set_mp_state(vcpu, state); return 1; @@ -11491,6 +11433,28 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } +static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + /* + * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and + * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted + * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be + * converted to INIT_RECEIVED. + */ + if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) + return -EINVAL; + + /* + * Disallow running the vCPU if userspace forced it into an impossible + * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked. + */ + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED && + !kvm_apic_init_sipi_allowed(vcpu)) + return -EINVAL; + + return kvm_x86_call(vcpu_pre_run)(vcpu); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; @@ -11593,7 +11557,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - r = kvm_x86_call(vcpu_pre_run)(vcpu); + r = kvm_x86_vcpu_pre_run(vcpu); if (r <= 0) goto out; @@ -11837,21 +11801,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, } /* - * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow - * forcing the guest into INIT/SIPI if those events are supposed to be - * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state - * if an SMI is pending as well. + * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead + * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI. + * Translate SIPI_RECEIVED as appropriate for backwards compatibility. 
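The set_mpstate hunk above keeps accepting the obsolete KVM_MP_STATE_SIPI_RECEIVED from older userspace and converts it into INIT_RECEIVED plus a pending SIPI. A userspace restore-path sketch that still relies on that translation, assuming vcpu_fd is an open vCPU file descriptor:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int restore_mp_state(int vcpu_fd, __u32 saved_state)
{
	/*
	 * saved_state may be KVM_MP_STATE_SIPI_RECEIVED in an old snapshot;
	 * KVM now records it as INIT_RECEIVED with the SIPI left pending.
	 */
	struct kvm_mp_state mp = { .mp_state = saved_state };

	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}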
*/ - if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) && - (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || - mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) - goto out; - if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); + mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); - } else - kvm_set_mp_state(vcpu, mp_state->mp_state); + } + + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -12388,13 +12347,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int idx; + int idx, cpu; kvm_clear_async_pf_completion_queue(vcpu); kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); @@ -12694,6 +12656,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { @@ -12723,26 +12686,22 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Decided by the vendor code for other VM types. */ kvm->arch.pre_fault_allowed = type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; ret = kvm_page_track_init(kvm); if (ret) goto out; - kvm_mmu_init_vm(kvm); + ret = kvm_mmu_init_vm(kvm); + if (ret) + goto out_cleanup_page_track; ret = kvm_x86_call(vm_init)(kvm); if (ret) goto out_uninit_mmu; - INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); atomic_set(&kvm->arch.noncoherent_dma_count, 0); - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); - raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); @@ -12781,6 +12740,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) out_uninit_mmu: kvm_mmu_uninit_vm(kvm); +out_cleanup_page_track: kvm_page_track_cleanup(kvm); out: return ret; @@ -12873,9 +12833,12 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); +#ifdef CONFIG_KVM_IOAPIC kvm_free_pit(kvm); +#endif kvm_mmu_pre_destroy_vm(kvm); + static_call_cond(kvm_x86_vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -12896,8 +12859,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); +#ifdef CONFIG_KVM_IOAPIC kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); +#endif kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); @@ -13073,7 +13038,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -13145,7 +13110,7 @@ static void 
kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { @@ -13507,25 +13472,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); } -void kvm_arch_start_assignment(struct kvm *kvm) -{ - if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - kvm_x86_call(pi_start_assignment)(kvm); -} -EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); - -void kvm_arch_end_assignment(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); - -bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) -{ - return raw_atomic_read(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); - static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) { /* @@ -13534,8 +13480,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first * (or last) non-coherent device is (un)registered to so that new SPTEs * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. */ - if (kvm_mmu_may_ignore_guest_pat()) + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } @@ -13559,77 +13507,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - struct kvm *kvm = irqfd->kvm; - int ret; - - kvm_arch_start_assignment(irqfd->kvm); - - spin_lock_irq(&kvm->irqfds.lock); - irqfd->producer = prod; - - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - if (ret) - kvm_arch_end_assignment(irqfd->kvm); - - spin_unlock_irq(&kvm->irqfds.lock); - - - return ret; -} - -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - int ret; - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - struct kvm *kvm = irqfd->kvm; - - WARN_ON(irqfd->producer != prod); - - /* - * When producer of consumer is unregistered, we change back to - * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabled or the consumer side (KVM - * int this case doesn't want to receive the interrupts. 
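The kvm_arch_start_assignment()/kvm_arch_end_assignment() helpers removed above ran the vendor hook only on the zero-to-one transition of the assigned-device count. A standalone sketch of that first-user/last-user pattern with C11 atomics (hypothetical names, not the replacement code):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int assigned_count;

/* True when this caller is the first user, i.e. setup should run. */
static bool assignment_start(void)
{
	return atomic_fetch_add(&assigned_count, 1) == 0;
}

/* True when this caller was the last user, i.e. teardown should run. */
static bool assignment_end(void)
{
	return atomic_fetch_sub(&assigned_count, 1) == 1;
}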
- */ - spin_lock_irq(&kvm->irqfds.lock); - irqfd->producer = NULL; - - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); - - spin_unlock_irq(&kvm->irqfds.lock); - - - kvm_arch_end_assignment(irqfd->kvm); -} - -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); -} - -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (old->type != KVM_IRQ_ROUTING_MSI || - new->type != KVM_IRQ_ROUTING_MSI) - return true; - - return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); -} - bool kvm_vector_hashing_enabled(void) { return vector_hashing; @@ -14012,6 +13889,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); @@ -14028,7 +13906,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9dc32a409076..bcfd9b719ada 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,6 +10,8 @@ #include "kvm_emulate.h" #include "cpuid.h" +#define KVM_MAX_MCE_BANKS 32 + struct kvm_caps { /* control of guest tsc rate supported? */ bool has_tsc_control; @@ -32,6 +34,9 @@ struct kvm_caps { u64 supported_xcr0; u64 supported_xss; u64 supported_perf_cap; + + u64 supported_quirks; + u64 inapplicable_quirks; }; struct kvm_host_values { @@ -50,6 +55,28 @@ struct kvm_host_values { void kvm_spurious_fault(void); +#define SIZE_OF_MEMSLOTS_HASHTABLE \ + (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES) + +/* Sanity check the size of the memslot hash tables. */ +static_assert(SIZE_OF_MEMSLOTS_HASHTABLE == + (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM)))); + +/* + * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation. + * Spilling over to an order-2 allocation isn't fundamentally problematic, but + * isn't expected to happen in the foreseeable future (O(years)). Assert that + * the size is an order-0 allocation when ignoring the memslot hash tables, to + * help detect and debug unexpected size increases. 
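The SIZE_OF_MEMSLOTS_HASHTABLE assert and the KVM_SANITY_CHECK_VM_STRUCT_SIZE() comment above are build-time guards against silent growth of the per-VM structures. The same technique in plain C11, with a made-up structure and a 4 KiB page assumption:

#include <stddef.h>

#define EXAMPLE_PAGE_SIZE	4096

struct example_vm {
	char mmu_state[1024];
	char apic_state[512];
};

/* Compilation fails the moment the structure spills past one page. */
_Static_assert(sizeof(struct example_vm) <= EXAMPLE_PAGE_SIZE,
	       "struct example_vm no longer fits in a single page");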
+ */ +#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \ +do { \ + BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \ + !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \ + BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \ + !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \ +} while (0) + #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ ({ \ bool failed = (consistency_check); \ @@ -116,6 +143,24 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) kvm_x86_ops.nested_ops->leave_nested(vcpu); } +/* + * If IBRS is advertised to the vCPU, KVM must flush the indirect branch + * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in + * this case) to provide separate predictor modes. Bare metal isolates the host + * from the guest, but doesn't isolate different guests from one another (in + * this case L1 and L2). The exception is if bare metal supports same mode IBRS, + * which offers protection within the same mode, and hence protects L1 from L2. + */ +static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) +{ + if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE)) + return; + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS)) + indirect_branch_prediction_barrier(); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; @@ -476,24 +521,34 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline void kvm_disable_exits(struct kvm *kvm, u64 mask) +{ + kvm->arch.disabled_exits |= mask; +} + static inline bool kvm_mwait_in_guest(struct kvm *kvm) { - return kvm->arch.mwait_in_guest; + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT; } static inline bool kvm_hlt_in_guest(struct kvm *kvm) { - return kvm->arch.hlt_in_guest; + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT; } static inline bool kvm_pause_in_guest(struct kvm *kvm) { - return kvm->arch.pause_in_guest; + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE; } static inline bool kvm_cstate_in_guest(struct kvm *kvm) { - return kvm->arch.cstate_in_guest; + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE; +} + +static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm) +{ + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF; } static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) @@ -629,25 +684,17 @@ static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr) return kvm->arch.hypercall_exit_enabled & BIT(hc_nr); } -int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl, +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int (*complete_hypercall)(struct kvm_vcpu *)); -#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, complete_hypercall) \ -({ \ - int __ret; \ - \ - __ret = ____kvm_emulate_hypercall(_vcpu, \ - kvm_##nr##_read(_vcpu), kvm_##a0##_read(_vcpu), \ - kvm_##a1##_read(_vcpu), kvm_##a2##_read(_vcpu), \ - kvm_##a3##_read(_vcpu), op_64_bit, cpl, \ - complete_hypercall); \ - \ - if (__ret > 0) \ - __ret = complete_hypercall(_vcpu); \ - __ret; \ +#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \ +({ \ + int __ret; \ + __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \ + \ + if (__ret > 0) \ + __ret = 
complete_hypercall(_vcpu); \ + __ret; \ }) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 38b33cdd4232..d6b2a665b499 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1526,7 +1526,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports, sched_poll.nr_ports * sizeof(*ports), &e)) { *r = -EFAULT; - return true; + goto out; } for (i = 0; i < sched_poll.nr_ports; i++) { @@ -1571,7 +1571,8 @@ out: static void cancel_evtchn_poll(struct timer_list *t) { - struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, + arch.xen.poll_timer); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -1970,8 +1971,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, { struct kvm_vcpu *vcpu; - if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) - return -EINVAL; + /* + * Don't check for the port being within range of max_evtchn_port(). + * Userspace can configure what ever targets it likes; events just won't + * be delivered if/while the target is invalid, just like userspace can + * configure MSIs which target non-existent APICs. + * + * This allow on Live Migration and Live Update, the IRQ routing table + * can be restored *independently* of other things like creating vCPUs, + * without imposing an ordering dependency on userspace. In this + * particular case, the problematic ordering would be with setting the + * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096 + * instead of 1024 event channels. + */ /* We only support 2 level event channels for now */ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore index 8ae0f93ecbfd..ec2131c9fd20 100644 --- a/arch/x86/lib/.gitignore +++ b/arch/x86/lib/.gitignore @@ -1,2 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ + inat-tables.c diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4fa5c4e1ba8a..2dba7f83ef97 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for x86 specific library files. # -obj-y += crypto/ - # Produces uninteresting flaky coverage. 
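The __kvm_emulate_hypercall() macro rewritten in the x86.h hunk above is a GNU statement expression: it evaluates to the helper's return value, but first invokes the completion callback whenever the helper returns a positive value. A reduced sketch of that wrapper shape, with hypothetical names:

/* do_step() returns <= 0 for "done or failed", > 0 for "needs completion". */
#define run_with_completion(obj, do_step, complete)		\
({								\
	int __ret = do_step(obj);				\
	if (__ret > 0)						\
		__ret = complete(obj);				\
	__ret;							\
})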
KCOV_INSTRUMENT_delay.o := n @@ -40,16 +38,6 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o -obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o -crc32-x86-y := crc32.o crc32-pclmul.o -crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o - -obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o -crc64-x86-y := crc64.o crc64-pclmul.o - -obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o -crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o - obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 7af743bd3b13..c5c60d07308c 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -14,9 +14,31 @@ void wbinvd_on_cpu(int cpu) } EXPORT_SYMBOL(wbinvd_on_cpu); -int wbinvd_on_all_cpus(void) +void wbinvd_on_all_cpus(void) { on_each_cpu(__wbinvd, NULL, 1); - return 0; } EXPORT_SYMBOL(wbinvd_on_all_cpus); + +void wbinvd_on_cpus_mask(struct cpumask *cpus) +{ + on_each_cpu_mask(cpus, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask); + +static void __wbnoinvd(void *dummy) +{ + wbnoinvd(); +} + +void wbnoinvd_on_all_cpus(void) +{ + on_each_cpu(__wbnoinvd, NULL, 1); +} +EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus); + +void wbnoinvd_on_cpus_mask(struct cpumask *cpus) +{ + on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1); +} +EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask); diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h deleted file mode 100644 index fcc63c064333..000000000000 --- a/arch/x86/lib/crc-pclmul-consts.h +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * CRC constants generated by: - * - * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5 - * - * Do not edit manually. 
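cache-smp.c above gains mask-based WBINVD helpers plus WBNOINVD counterparts. WBNOINVD writes dirty cache lines back without invalidating them, and its encoding is simply a REP-prefixed WBINVD (F3 0F 09), so CPUs that lack the feature ignore the prefix and execute a plain WBINVD. A sketch of such a wrapper:

static inline void wbnoinvd_sketch(void)
{
	/*
	 * "repz; wbinvd" assembles to F3 0F 09, i.e. WBNOINVD; without
	 * X86_FEATURE_WBNOINVD the prefix is ignored and this degrades
	 * gracefully to a full writeback-and-invalidate.
	 */
	asm volatile("repz; wbinvd" : : : "memory");
}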
- */ - -/* - * CRC folding constants generated for most-significant-bit-first CRC-16 using - * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 - */ -static const struct { - u8 bswap_mask[16]; - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { - .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - .fold_across_2048_bits_consts = { - 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ - 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ - }, - .fold_across_1024_bits_consts = { - 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ - 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ - }, - .fold_across_512_bits_consts = { - 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ - 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ - }, - .fold_across_256_bits_consts = { - 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ - 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ - }, - .fold_across_128_bits_consts = { - 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ - 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ - 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ - }, -}; - -/* - * CRC folding constants generated for least-significant-bit-first CRC-32 using - * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + - * x^5 + x^4 + x^2 + x^1 + x^0 - */ -static const struct { - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { - .fold_across_2048_bits_consts = { - 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ - 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ - }, - .fold_across_1024_bits_consts = { - 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ - 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ - }, - .fold_across_512_bits_consts = { - 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ - 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ - }, - .fold_across_256_bits_consts = { - 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ - 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ - }, - .fold_across_128_bits_consts = { - 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ - 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ - 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ - }, -}; - -/* - * CRC folding 
constants generated for most-significant-bit-first CRC-64 using - * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x^1 + x^0 - */ -static const struct { - u8 bswap_mask[16]; - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = { - .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - .fold_across_2048_bits_consts = { - 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */ - 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */ - }, - .fold_across_1024_bits_consts = { - 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */ - 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */ - }, - .fold_across_512_bits_consts = { - 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */ - 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */ - }, - .fold_across_256_bits_consts = { - 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */ - 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */ - }, - .fold_across_128_bits_consts = { - 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */ - 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */ - 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */ - }, -}; - -/* - * CRC folding constants generated for least-significant-bit-first CRC-64 using - * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + - * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + - * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + - * x^4 + x^3 + x^0 - */ -static const struct { - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = { - .fold_across_2048_bits_consts = { - 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */ - 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */ - }, - .fold_across_1024_bits_consts = { - 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */ - 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */ - }, - .fold_across_512_bits_consts = { - 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */ - 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */ - }, - .fold_across_256_bits_consts = { - 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */ - 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */ - }, - .fold_across_128_bits_consts = { - 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */ - 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */ - 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */ - }, -}; diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S deleted file mode 100644 index ae0b6144c503..000000000000 --- a/arch/x86/lib/crc-pclmul-template.S +++ /dev/null @@ -1,582 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// -// Template to generate [V]PCLMULQDQ-based CRC functions for x86 -// -// Copyright 2025 Google LLC -// -// Author: Eric Biggers <ebiggers@google.com> - -#include <linux/linkage.h> -#include <linux/objtool.h> - -// Offsets within the generated constants table -.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only -.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 -.set OFFSETOF_SHUF_TABLE, 1*16 -.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 - -// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the -// corresponding non-VEX instruction plus any needed moves. The supported -// instruction formats are: -// -// - Two-arg [src, dst], where the non-VEX format is the same. -// - Three-arg [src1, src2, dst] where the non-VEX format is -// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. -// -// \insn gives the instruction without a "v" prefix and including any immediate -// argument if needed to make the instruction follow one of the above formats. -// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to -// it first; this is needed when \arg1 is an unaligned mem operand. -.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp -.if AVX_LEVEL == 0 - // VEX not allowed. Emulate it. - .ifnb \arg3 // Three-arg [src1, src2, dst] - .ifc "\arg2", "\arg3" // src2 == dst? - .ifnb \unaligned_mem_tmp - movdqu \arg1, \unaligned_mem_tmp - \insn \unaligned_mem_tmp, \arg3 - .else - \insn \arg1, \arg3 - .endif - .else // src2 != dst - .ifc "\arg1", "\arg3" - .error "Can't have src1 == dst when src2 != dst" - .endif - .ifnb \unaligned_mem_tmp - movdqu \arg1, \unaligned_mem_tmp - movdqa \arg2, \arg3 - \insn \unaligned_mem_tmp, \arg3 - .else - movdqa \arg2, \arg3 - \insn \arg1, \arg3 - .endif - .endif - .else // Two-arg [src, dst] - .ifnb \unaligned_mem_tmp - movdqu \arg1, \unaligned_mem_tmp - \insn \unaligned_mem_tmp, \arg2 - .else - \insn \arg1, \arg2 - .endif - .endif -.else - // VEX is allowed. Emit the desired instruction directly. - .ifnb \arg3 - v\insn \arg1, \arg2, \arg3 - .else - v\insn \arg1, \arg2 - .endif -.endif -.endm - -// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector -// register of length VL. -.macro _vbroadcast src, dst -.if VL == 16 - _cond_vex movdqa, \src, \dst -.elseif VL == 32 - vbroadcasti128 \src, \dst -.else - vbroadcasti32x4 \src, \dst -.endif -.endm - -// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC -// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. 
-.macro _load_data vl, src, bswap_mask, dst -.if \vl < 64 - _cond_vex movdqu, "\src", \dst -.else - vmovdqu8 \src, \dst -.endif -.if !LSB_CRC - _cond_vex pshufb, \bswap_mask, \dst, \dst -.endif -.endm - -.macro _prepare_v0 vl, v0, v1, bswap_mask -.if LSB_CRC - .if \vl < 64 - _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 - .else - vpxorq (BUF), \v0, \v0 - .endif -.else - _load_data \vl, (BUF), \bswap_mask, \v1 - .if \vl < 64 - _cond_vex pxor, \v1, \v0, \v0 - .else - vpxorq \v1, \v0, \v0 - .endif -.endif -.endm - -// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for -// msb-first order or the physically high qword for lsb-first order -#define LO64_TERMS 0 - -// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high -// qword for msb-first order or the physically low qword for lsb-first order -#define HI64_TERMS 1 - -// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given -// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. -.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst - _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ - \src1, \src2, \dst -.endm - -// Fold \acc into \data and store the result back into \acc. \data can be an -// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no -// byte-reflection is needed; otherwise it must be a vector register. \consts -// is a vector register containing the needed fold constants, and \tmp is a -// temporary vector register. All arguments must be the same length. -.macro _fold_vec acc, data, consts, tmp - _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp - _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc -.if AVX_LEVEL <= 2 - _cond_vex pxor, \data, \tmp, \tmp - _cond_vex pxor, \tmp, \acc, \acc -.else - vpternlogq $0x96, \data, \tmp, \acc -.endif -.endm - -// Fold \acc into \data and store the result back into \acc. \data is an -// unaligned mem operand, \consts is a vector register containing the needed -// fold constants, \bswap_mask is a vector register containing the -// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are -// temporary vector registers. All arguments must have length \vl. -.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 -.if AVX_LEVEL == 0 || !LSB_CRC - _load_data \vl, \data, \bswap_mask, \tmp1 - _fold_vec \acc, \tmp1, \consts, \tmp2 -.else - _fold_vec \acc, \data, \consts, \tmp1 -.endif -.endm - -// Load the constants for folding across 2**i vectors of length VL at a time -// into all 128-bit lanes of the vector register CONSTS. -.macro _load_vec_folding_consts i - _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ - CONSTS -.endm - -// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store -// the result back into \v0. If the remaining length mod \vl is nonzero, also -// fold \vl data bytes from BUF. For both operations the fold distance is \vl. -// \consts must be a register of length \vl containing the fold constants. 
-.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 - _fold_vec \v0, \v1, \consts, \tmp1 - test $\vl, LEN8 - jz .Lfold_vec_final_done\@ - _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 - add $\vl, BUF -.Lfold_vec_final_done\@: -.endm - -// This macro generates the body of a CRC function with the following prototype: -// -// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); -// -// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. -// |buf| is the data to checksum. |len| is the data length in bytes, which must -// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts -// field of the constants struct that was generated for the chosen CRC variant. -// -// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g. -// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If -// the file is compiled in i386 mode, then the maximum supported value is 32. -// -// \lsb_crc is 1 if the CRC processes the least significant bit of each byte -// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 -// if the CRC processes the most significant bit of each byte first, i.e. maps -// bit0 to x^0, bit1 to x^1, bit7 to x^7. -// -// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. -// -// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or -// 512 for AVX512. -// -// If \vl == 16 && \avx_level == 0, the generated code requires: -// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) -// -// If \vl == 32 && \avx_level == 2, the generated code requires: -// VPCLMULQDQ && AVX2. -// -// If \vl == 64 && \avx_level == 512, the generated code requires: -// VPCLMULQDQ && AVX512BW && AVX512VL. -// -// Other \vl and \avx_level combinations are either not supported or not useful. -.macro _crc_pclmul n, lsb_crc, vl, avx_level - .set LSB_CRC, \lsb_crc - .set VL, \vl - .set AVX_LEVEL, \avx_level - - // Define aliases for the xmm, ymm, or zmm registers according to VL. -.irp i, 0,1,2,3,4,5,6,7 - .if VL == 16 - .set V\i, %xmm\i - .set LOG2_VL, 4 - .elseif VL == 32 - .set V\i, %ymm\i - .set LOG2_VL, 5 - .elseif VL == 64 - .set V\i, %zmm\i - .set LOG2_VL, 6 - .else - .error "Unsupported vector length" - .endif -.endr - // Define aliases for the function parameters. - // Note: when crc_t is shorter than u32, zero-extension to 32 bits is - // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed - // when crc_t is shorter than u64. -#ifdef __x86_64__ -.if \n <= 32 - .set CRC, %edi -.else - .set CRC, %rdi -.endif - .set BUF, %rsi - .set LEN, %rdx - .set LEN32, %edx - .set LEN8, %dl - .set CONSTS_PTR, %rcx -#else - // 32-bit support, assuming -mregparm=3 and not including support for - // CRC-64 (which would use both eax and edx to pass the crc parameter). - .set CRC, %eax - .set BUF, %edx - .set LEN, %ecx - .set LEN32, %ecx - .set LEN8, %cl - .set CONSTS_PTR, %ebx // Passed on stack -#endif - - // Define aliases for some local variables. V0-V5 are used without - // aliases (for accumulators, data, temporary values, etc). Staying - // within the first 8 vector registers keeps the code 32-bit SSE - // compatible and reduces the size of 64-bit SSE code slightly. 
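The _fold_vec/_fold_vec_mem macros above fold the running remainder into the next block of data using two carry-less multiplies per 128-bit lane (one per 64-bit half) followed by XORs. A plain-C model of the 64x64 -> 128-bit carry-less multiply that a single PCLMULQDQ performs:

#include <stdint.h>

/* GF(2) multiplication: partial products are combined with XOR, no carries. */
static void clmul_64x64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*lo = l;
	*hi = h;
}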
- .set BSWAP_MASK, V6 - .set BSWAP_MASK_YMM, %ymm6 - .set BSWAP_MASK_XMM, %xmm6 - .set CONSTS, V7 - .set CONSTS_YMM, %ymm7 - .set CONSTS_XMM, %xmm7 - - // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the - // functions generated by this macro are called only by static_call. - ANNOTATE_NOENDBR - -#ifdef __i386__ - push CONSTS_PTR - mov 8(%esp), CONSTS_PTR -#endif - - // Create a 128-bit vector that contains the initial CRC in the end - // representing the high-order polynomial coefficients, and the rest 0. - // If the CRC is msb-first, also load the byte-reflection table. -.if \n <= 32 - _cond_vex movd, CRC, %xmm0 -.else - _cond_vex movq, CRC, %xmm0 -.endif -.if !LSB_CRC - _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 - _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK -.endif - - // Load the first vector of data and XOR the initial CRC into the - // appropriate end of the first 128-bit lane of data. If LEN < VL, then - // use a short vector and jump ahead to the final reduction. (LEN >= 16 - // is guaranteed here but not necessarily LEN >= VL.) -.if VL >= 32 - cmp $VL, LEN - jae .Lat_least_1vec\@ - .if VL == 64 - cmp $32, LEN32 - jb .Lless_than_32bytes\@ - _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM - add $32, BUF - jmp .Lreduce_256bits_to_128bits\@ -.Lless_than_32bytes\@: - .endif - _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM - add $16, BUF - vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM - jmp .Lcheck_for_partial_block\@ -.Lat_least_1vec\@: -.endif - _prepare_v0 VL, V0, V1, BSWAP_MASK - - // Handle VL <= LEN < 4*VL. - cmp $4*VL-1, LEN - ja .Lat_least_4vecs\@ - add $VL, BUF - // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. - // If VL==16 then load fold_across_128_bits_consts first, as the final - // reduction depends on it and it won't be loaded anywhere else. - cmp $2*VL-1, LEN32 -.if VL == 16 - _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM -.endif - jbe .Lreduce_1vec_to_128bits\@ - // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to - // the reduction from 2 vectors. - _load_data VL, (BUF), BSWAP_MASK, V1 - add $VL, BUF - jmp .Lreduce_2vecs_to_1\@ - -.Lat_least_4vecs\@: - // Load 3 more vectors of data. - _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 - _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 - _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 - sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 - add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 - - // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next - // 4 vectors of data and write the result back to V0-V3. - cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 - jbe .Lreduce_4vecs_to_2\@ - _load_vec_folding_consts 2 -.Lfold_4vecs_loop\@: - _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - sub $-4*VL, BUF - add $-4*VL, LEN - cmp $4*VL-1, LEN - ja .Lfold_4vecs_loop\@ - - // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold - // two more vectors of data from BUF, if at least that much remains. 
-.Lreduce_4vecs_to_2\@: - _load_vec_folding_consts 1 - _fold_vec V0, V2, CONSTS, V4 - _fold_vec V1, V3, CONSTS, V4 - test $2*VL, LEN8 - jz .Lreduce_2vecs_to_1\@ - _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - sub $-2*VL, BUF - - // Fold V0 into V1 and write the result back to V0. Then fold one more - // vector of data from BUF, if at least that much remains. -.Lreduce_2vecs_to_1\@: - _load_vec_folding_consts 0 - _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 - -.Lreduce_1vec_to_128bits\@: -.if VL == 64 - // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of - // data from BUF, if at least that much remains. - vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM - vextracti64x4 $1, %zmm0, %ymm1 - _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 -.Lreduce_256bits_to_128bits\@: -.endif -.if VL >= 32 - // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of - // data from BUF, if at least that much remains. - vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM - vextracti128 $1, %ymm0, %xmm1 - _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 -.Lcheck_for_partial_block\@: -.endif - and $15, LEN32 - jz .Lreduce_128bits_to_crc\@ - - // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now - // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 - // and B is the polynomial of the remaining LEN data bytes. To reduce - // this to 128 bits without needing fold constants for each possible - // LEN, rearrange this expression into C1*(x^128) + C2, where - // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. - // Then fold C1 into C2, which is just another fold across 128 bits. - -.if !LSB_CRC || AVX_LEVEL == 0 - // Load the last 16 data bytes. Note that originally LEN was >= 16. - _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 -.endif // Else will use vpblendvb mem operand later. -.if !LSB_CRC - neg LEN // Needed for indexing shuf_table -.endif - - // tmp = A*x^(8*LEN) mod x^128 - // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] - // i.e. right-shift by LEN bytes. - // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] - // i.e. left-shift by LEN bytes. - _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 - _cond_vex pshufb, %xmm3, %xmm0, %xmm1 - - // C1 = floor(A / x^(128 - 8*LEN)) - // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] - // i.e. left-shift by 16-LEN bytes. - // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] - // i.e. right-shift by 16-LEN bytes. - _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ - %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 - - // C2 = tmp + B. This is just a blend of tmp with the last 16 data - // bytes (reflected if msb-first). The blend mask is the shuffle table - // that was used to create tmp. 0 selects tmp, and 1 last16databytes. -.if AVX_LEVEL == 0 - movdqa %xmm0, %xmm4 - movdqa %xmm3, %xmm0 - pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand - movdqa %xmm4, %xmm0 -.elseif LSB_CRC - vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 -.else - vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -.endif - - // Fold C1 into C2 and store the 128-bit result in %xmm0. - _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 - -.Lreduce_128bits_to_crc\@: - // Compute the CRC as %xmm0 * x^n mod G. 
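The final reduction described above computes the CRC as the folded 128-bit remainder times x^n modulo G, using a Barrett step instead of long division. For a 32-bit msb-first CRC, the long-division reference that the Barrett sequence must match looks like this (uses the GCC/Clang unsigned __int128 extension):

#include <stdint.h>

/* Reduce a 128-bit polynomial modulo G(x) = x^32 + g, msb-first bit order. */
static uint32_t poly128_mod(unsigned __int128 v, uint32_t g)
{
	const unsigned __int128 G = ((unsigned __int128)1 << 32) | g;
	int i;

	for (i = 127; i >= 32; i--)
		if ((v >> i) & 1)
			v ^= G << (i - 32);	/* cancel the x^i term */

	return (uint32_t)v;			/* remainder now has degree < 32 */
}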
Here %xmm0 means the 128-bit - // polynomial stored in %xmm0 (using either lsb-first or msb-first bit - // order according to LSB_CRC), and G is the CRC's generator polynomial. - - // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: - // - // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + - // x^n * (%xmm0 mod x^64) - // - // Store t0 * x^(64-n) in %xmm0. I.e., actually do: - // - // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + - // x^64 * (%xmm0 mod x^64) - // - // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned - // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily - // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the - // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case - // (considering the extra factor of x that gets implicitly introduced by - // each pclmulqdq when using lsb-first order), is identical to the - // constant that was used earlier for folding the LO64_TERMS across 128 - // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. - _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 -.if LSB_CRC - _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) -.else - _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) -.endif - _cond_vex pxor, %xmm1, %xmm0, %xmm0 - // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). - // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). - - // First step of Barrett reduction: Compute floor(t0 / G). This is the - // polynomial by which G needs to be multiplied to cancel out the x^n - // and higher terms of t0, i.e. to reduce t0 mod G. First do: - // - // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) - // - // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in - // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest - // value that makes enough precision be carried through the calculation. - // - // The '* x' makes it so the result is floor(t1 / x^64) rather than - // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it - // can be extracted much more easily in the next step. In the lsb-first - // case the '* x' happens implicitly. In the msb-first case it must be - // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the - // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and - // the multiplication by the x^64 term is handled using a pxor. The - // pxor causes the low 64 terms of t1 to be wrong, but they are unused. - _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM - _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 -.if !LSB_CRC - _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) -.endif - // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). - - // Second step of Barrett reduction: Cancel out the x^n and higher terms - // of t0 by subtracting the needed multiple of G. This gives the CRC: - // - // crc := t0 - (G * floor(t0 / G)) - // - // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: - // - // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) - // - // Furthermore, since the resulting CRC is n-bit, if mod x^n is - // explicitly applied to it then the x^n term of G makes no difference - // in the result and can be omitted. This helps keep the constant - // multiplier in 64 bits in most cases. 
This gives the following: - // - // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) - // crc := (%xmm0 / x^(64-n)) mod x^n - // - // In the lsb-first case, each pclmulqdq implicitly introduces - // an extra factor of x, so in that case the constant that needs to be - // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. - // For lsb-first CRCs where n=64, the extra factor of x cannot be as - // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to - // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC - // polynomials have nonzero x^n and x^0 terms.) It works out as: the - // CRC has be XORed with the physically low qword of %xmm1, representing - // floor(t0 / G). The most efficient way to do that is to move it to - // the physically high qword and use a ternlog to combine the two XORs. -.if LSB_CRC && \n == 64 - _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 - _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 - .if AVX_LEVEL <= 2 - _cond_vex pxor, %xmm2, %xmm0, %xmm0 - _cond_vex pxor, %xmm1, %xmm0, %xmm0 - .else - vpternlogq $0x96, %xmm2, %xmm1, %xmm0 - .endif - _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 -.else - _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 - _cond_vex pxor, %xmm1, %xmm0, %xmm0 - .if \n == 8 - _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 - .elseif \n == 16 - _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 - .elseif \n == 32 - _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 - .else // \n == 64 && !LSB_CRC - _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 - .endif -.endif - -.if VL > 16 - vzeroupper // Needed when ymm or zmm registers may have been used. -.endif -#ifdef __i386__ - pop CONSTS_PTR -#endif - RET -.endm - -#ifdef CONFIG_AS_VPCLMULQDQ -#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ -SYM_FUNC_START(prefix##_pclmul_sse); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ -SYM_FUNC_END(prefix##_pclmul_sse); \ - \ -SYM_FUNC_START(prefix##_vpclmul_avx2); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ -SYM_FUNC_END(prefix##_vpclmul_avx2); \ - \ -SYM_FUNC_START(prefix##_vpclmul_avx512); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \ -SYM_FUNC_END(prefix##_vpclmul_avx512); -#else -#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ -SYM_FUNC_START(prefix##_pclmul_sse); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ -SYM_FUNC_END(prefix##_pclmul_sse); -#endif // !CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h deleted file mode 100644 index c5b3bfe11d8d..000000000000 --- a/arch/x86/lib/crc-pclmul-template.h +++ /dev/null @@ -1,76 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are - * instantiated by crc-pclmul-template.S - * - * Copyright 2025 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ -#ifndef _CRC_PCLMUL_TEMPLATE_H -#define _CRC_PCLMUL_TEMPLATE_H - -#include <asm/cpufeatures.h> -#include <asm/simd.h> -#include <crypto/internal/simd.h> -#include <linux/static_call.h> -#include "crc-pclmul-consts.h" - -#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ -crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ - const void *consts_ptr); \ -crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ - const void *consts_ptr); \ -crc_t prefix##_vpclmul_avx512(crc_t crc, const 
u8 *p, size_t len, \ - const void *consts_ptr); \ -DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) - -#define INIT_CRC_PCLMUL(prefix) \ -do { \ - if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \ - boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \ - boot_cpu_has(X86_FEATURE_AVX2) && \ - cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \ - if (boot_cpu_has(X86_FEATURE_AVX512BW) && \ - boot_cpu_has(X86_FEATURE_AVX512VL) && \ - !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \ - cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \ - static_call_update(prefix##_pclmul, \ - prefix##_vpclmul_avx512); \ - } else { \ - static_call_update(prefix##_pclmul, \ - prefix##_vpclmul_avx2); \ - } \ - } \ -} while (0) - -/* - * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 - * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. - * - * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. - * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), - * varying by CPU and factors such as which parts of the "FPU" state userspace - * has touched, which could result in a larger cutoff being better. Indeed, a - * larger cutoff is usually better for a *single* message. However, the - * overhead of the FPU section gets amortized if multiple FPU sections get - * executed before returning to userspace, since the XSAVE and XRSTOR occur only - * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on - * the dcache than the table-based code is, a 16-byte cutoff seems to work well. - */ -#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ -do { \ - if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ - crypto_simd_usable()) { \ - const void *consts_ptr; \ - \ - consts_ptr = (consts).fold_across_128_bits_consts; \ - kernel_fpu_begin(); \ - crc = static_call(prefix##_pclmul)((crc), (p), (len), \ - consts_ptr); \ - kernel_fpu_end(); \ - return crc; \ - } \ -} while (0) - -#endif /* _CRC_PCLMUL_TEMPLATE_H */ diff --git a/arch/x86/lib/crc-t10dif.c b/arch/x86/lib/crc-t10dif.c deleted file mode 100644 index db7ce59c31ac..000000000000 --- a/arch/x86/lib/crc-t10dif.c +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * CRC-T10DIF using [V]PCLMULQDQ instructions - * - * Copyright 2024 Google LLC - */ - -#include <linux/crc-t10dif.h> -#include <linux/module.h> -#include "crc-pclmul-template.h" - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); - -u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, - have_pclmulqdq); - return crc_t10dif_generic(crc, p, len); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc16_msb); - } - return 0; -} -subsys_initcall(crc_t10dif_x86_init); - -static void __exit crc_t10dif_x86_exit(void) -{ -} -module_exit(crc_t10dif_x86_exit); - -MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S deleted file mode 100644 index e9fe248093a8..000000000000 --- a/arch/x86/lib/crc16-msb-pclmul.S +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// Copyright 2025 Google LLC - -#include "crc-pclmul-template.S" - -DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, 
/* lsb= */ 0) diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S deleted file mode 100644 index f20f40fb0172..000000000000 --- a/arch/x86/lib/crc32-pclmul.S +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// Copyright 2025 Google LLC - -#include "crc-pclmul-template.S" - -DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1) diff --git a/arch/x86/lib/crc32.c b/arch/x86/lib/crc32.c deleted file mode 100644 index d09343e2cea9..000000000000 --- a/arch/x86/lib/crc32.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * x86-optimized CRC32 functions - * - * Copyright (C) 2008 Intel Corporation - * Copyright 2012 Xyratex Technology Limited - * Copyright 2024 Google LLC - */ - -#include <linux/crc32.h> -#include <linux/module.h> -#include "crc-pclmul-template.h" - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, - have_pclmulqdq); - return crc32_le_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -#ifdef CONFIG_X86_64 -#define CRC32_INST "crc32q %1, %q0" -#else -#define CRC32_INST "crc32l %1, %0" -#endif - -/* - * Use carryless multiply version of crc32c when buffer size is >= 512 to - * account for FPU state save/restore overhead. - */ -#define CRC32C_PCLMUL_BREAKEVEN 512 - -asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - size_t num_longs; - - if (!static_branch_likely(&have_crc32)) - return crc32c_base(crc, p, len); - - if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && - static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { - kernel_fpu_begin(); - crc = crc32c_x86_3way(crc, p, len); - kernel_fpu_end(); - return crc; - } - - for (num_longs = len / sizeof(unsigned long); - num_longs != 0; num_longs--, p += sizeof(unsigned long)) - asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); - - if (sizeof(unsigned long) > 4 && (len & 4)) { - asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); - p += 4; - } - if (len & 2) { - asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); - p += 2; - } - if (len & 1) - asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); - - return crc; -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - return crc32_be_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -static int __init crc32_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_XMM4_2)) - static_branch_enable(&have_crc32); - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc32_lsb); - } - return 0; -} -subsys_initcall(crc32_x86_init); - -static void __exit crc32_x86_exit(void) -{ -} -module_exit(crc32_x86_exit); - -u32 crc32_optimizations(void) -{ - u32 optimizations = 0; - - if (static_key_enabled(&have_crc32)) - optimizations |= CRC32C_OPTIMIZATION; - if (static_key_enabled(&have_pclmulqdq)) - optimizations |= CRC32_LE_OPTIMIZATION; - return optimizations; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_DESCRIPTION("x86-optimized CRC32 functions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S deleted file mode 100644 index 9b8770503bbc..000000000000 --- a/arch/x86/lib/crc32c-3way.S +++ 
/dev/null @@ -1,360 +0,0 @@ -/* - * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) - * - * The white papers on CRC32C calculations with PCLMULQDQ instruction can be - * downloaded from: - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf - * - * Copyright (C) 2012 Intel Corporation. - * Copyright 2024 Google LLC - * - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> - * James Guilford <james.guilford@intel.com> - * David Cote <david.m.cote@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/linkage.h> - -## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction - -# Define threshold below which buffers are considered "small" and routed to -# regular CRC code that does not interleave the CRC instructions. -#define SMALL_SIZE 200 - -# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); - -.text -SYM_FUNC_START(crc32c_x86_3way) -#define crc0 %edi -#define crc0_q %rdi -#define bufp %rsi -#define bufp_d %esi -#define len %rdx -#define len_dw %edx -#define n_misaligned %ecx /* overlaps chunk_bytes! */ -#define n_misaligned_q %rcx -#define chunk_bytes %ecx /* overlaps n_misaligned! */ -#define chunk_bytes_q %rcx -#define crc1 %r8 -#define crc2 %r9 - - cmp $SMALL_SIZE, len - jb .Lsmall - - ################################################################ - ## 1) ALIGN: - ################################################################ - mov bufp_d, n_misaligned - neg n_misaligned - and $7, n_misaligned # calculate the misalignment amount of - # the address - je .Laligned # Skip if aligned - - # Process 1 <= n_misaligned <= 7 bytes individually in order to align - # the remaining data to an 8-byte boundary. 
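For orientation, everything in this file, like the crc32c_arch() glue above, computes the standard reflected CRC32C (Castagnoli) checksum: the crc32 instruction, the three interleaved lanes, and the PCLMULQDQ recombination via K_table are only faster ways of evaluating the same per-byte update. A minimal bit-at-a-time C reference, assuming the reflected polynomial 0x82F63B78 and a made-up function name (this is illustration, not a kernel API):

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Bitwise CRC32C update, illustrative only: same result as the
	 * accelerated paths in this diff, just orders of magnitude slower.
	 * Reflected (lsb-first) polynomial 0x82F63B78.
	 */
	static uint32_t crc32c_bitwise(uint32_t crc, const uint8_t *p, size_t len)
	{
		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
		}
		return crc;
	}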
-.Ldo_align: - movq (bufp), %rax - add n_misaligned_q, bufp - sub n_misaligned_q, len -.Lalign_loop: - crc32b %al, crc0 # compute crc32 of 1-byte - shr $8, %rax # get next byte - dec n_misaligned - jne .Lalign_loop -.Laligned: - - ################################################################ - ## 2) PROCESS BLOCK: - ################################################################ - - cmp $128*24, len - jae .Lfull_block - -.Lpartial_block: - # Compute floor(len / 24) to get num qwords to process from each lane. - imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24) - shr $16, %eax - jmp .Lcrc_3lanes - -.Lfull_block: - # Processing 128 qwords from each lane. - mov $128, %eax - - ################################################################ - ## 3) CRC each of three lanes: - ################################################################ - -.Lcrc_3lanes: - xor crc1,crc1 - xor crc2,crc2 - mov %eax, chunk_bytes - shl $3, chunk_bytes # num bytes to process from each lane - sub $5, %eax # 4 for 4x_loop, 1 for special last iter - jl .Lcrc_3lanes_4x_done - - # Unroll the loop by a factor of 4 to reduce the overhead of the loop - # bookkeeping instructions, which can compete with crc32q for the ALUs. -.Lcrc_3lanes_4x_loop: - crc32q (bufp), crc0_q - crc32q (bufp,chunk_bytes_q), crc1 - crc32q (bufp,chunk_bytes_q,2), crc2 - crc32q 8(bufp), crc0_q - crc32q 8(bufp,chunk_bytes_q), crc1 - crc32q 8(bufp,chunk_bytes_q,2), crc2 - crc32q 16(bufp), crc0_q - crc32q 16(bufp,chunk_bytes_q), crc1 - crc32q 16(bufp,chunk_bytes_q,2), crc2 - crc32q 24(bufp), crc0_q - crc32q 24(bufp,chunk_bytes_q), crc1 - crc32q 24(bufp,chunk_bytes_q,2), crc2 - add $32, bufp - sub $4, %eax - jge .Lcrc_3lanes_4x_loop - -.Lcrc_3lanes_4x_done: - add $4, %eax - jz .Lcrc_3lanes_last_qword - -.Lcrc_3lanes_1x_loop: - crc32q (bufp), crc0_q - crc32q (bufp,chunk_bytes_q), crc1 - crc32q (bufp,chunk_bytes_q,2), crc2 - add $8, bufp - dec %eax - jnz .Lcrc_3lanes_1x_loop - -.Lcrc_3lanes_last_qword: - crc32q (bufp), crc0_q - crc32q (bufp,chunk_bytes_q), crc1 -# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet - - ################################################################ - ## 4) Combine three results: - ################################################################ - - lea (K_table-8)(%rip), %rax # first entry is for idx 1 - pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 - lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 - sub %rax, len # len -= chunk_bytes * 3 - - movq crc0_q, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 - - movq crc1, %xmm2 # CRC for block 2 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 - - pxor %xmm2,%xmm1 - movq %xmm1, %rax - xor (bufp,chunk_bytes_q,2), %rax - mov crc2, crc0_q - crc32 %rax, crc0_q - lea 8(bufp,chunk_bytes_q,2), bufp - - ################################################################ - ## 5) If more blocks remain, goto (2): - ################################################################ - - cmp $128*24, len - jae .Lfull_block - cmp $SMALL_SIZE, len - jae .Lpartial_block - - ####################################################################### - ## 6) Process any remainder without interleaving: - ####################################################################### -.Lsmall: - test len_dw, len_dw - jz .Ldone - mov len_dw, %eax - shr $3, %eax - jz .Ldo_dword -.Ldo_qwords: - crc32q (bufp), crc0_q - add $8, bufp - dec %eax - jnz .Ldo_qwords -.Ldo_dword: - test $4, len_dw - jz .Ldo_word - crc32l (bufp), crc0 - add $4, bufp -.Ldo_word: - 
test $2, len_dw - jz .Ldo_byte - crc32w (bufp), crc0 - add $2, bufp -.Ldo_byte: - test $1, len_dw - jz .Ldone - crc32b (bufp), crc0 -.Ldone: - mov crc0, %eax - RET -SYM_FUNC_END(crc32c_x86_3way) - -.section .rodata, "a", @progbits - ################################################################ - ## PCLMULQDQ tables - ## Table is 128 entries x 2 words (8 bytes) each - ################################################################ -.align 8 -K_table: - .long 0x493c7d27, 0x00000001 - .long 0xba4fc28e, 0x493c7d27 - .long 0xddc0152b, 0xf20c0dfe - .long 0x9e4addf8, 0xba4fc28e - .long 0x39d3b296, 0x3da6d0cb - .long 0x0715ce53, 0xddc0152b - .long 0x47db8317, 0x1c291d04 - .long 0x0d3b6092, 0x9e4addf8 - .long 0xc96cfdc0, 0x740eef02 - .long 0x878a92a7, 0x39d3b296 - .long 0xdaece73e, 0x083a6eec - .long 0xab7aff2a, 0x0715ce53 - .long 0x2162d385, 0xc49f4f67 - .long 0x83348832, 0x47db8317 - .long 0x299847d5, 0x2ad91c30 - .long 0xb9e02b86, 0x0d3b6092 - .long 0x18b33a4e, 0x6992cea2 - .long 0xb6dd949b, 0xc96cfdc0 - .long 0x78d9ccb7, 0x7e908048 - .long 0xbac2fd7b, 0x878a92a7 - .long 0xa60ce07b, 0x1b3d8f29 - .long 0xce7f39f4, 0xdaece73e - .long 0x61d82e56, 0xf1d0f55e - .long 0xd270f1a2, 0xab7aff2a - .long 0xc619809d, 0xa87ab8a8 - .long 0x2b3cac5d, 0x2162d385 - .long 0x65863b64, 0x8462d800 - .long 0x1b03397f, 0x83348832 - .long 0xebb883bd, 0x71d111a8 - .long 0xb3e32c28, 0x299847d5 - .long 0x064f7f26, 0xffd852c6 - .long 0xdd7e3b0c, 0xb9e02b86 - .long 0xf285651c, 0xdcb17aa4 - .long 0x10746f3c, 0x18b33a4e - .long 0xc7a68855, 0xf37c5aee - .long 0x271d9844, 0xb6dd949b - .long 0x8e766a0c, 0x6051d5a2 - .long 0x93a5f730, 0x78d9ccb7 - .long 0x6cb08e5c, 0x18b0d4ff - .long 0x6b749fb2, 0xbac2fd7b - .long 0x1393e203, 0x21f3d99c - .long 0xcec3662e, 0xa60ce07b - .long 0x96c515bb, 0x8f158014 - .long 0xe6fc4e6a, 0xce7f39f4 - .long 0x8227bb8a, 0xa00457f7 - .long 0xb0cd4768, 0x61d82e56 - .long 0x39c7ff35, 0x8d6d2c43 - .long 0xd7a4825c, 0xd270f1a2 - .long 0x0ab3844b, 0x00ac29cf - .long 0x0167d312, 0xc619809d - .long 0xf6076544, 0xe9adf796 - .long 0x26f6a60a, 0x2b3cac5d - .long 0xa741c1bf, 0x96638b34 - .long 0x98d8d9cb, 0x65863b64 - .long 0x49c3cc9c, 0xe0e9f351 - .long 0x68bce87a, 0x1b03397f - .long 0x57a3d037, 0x9af01f2d - .long 0x6956fc3b, 0xebb883bd - .long 0x42d98888, 0x2cff42cf - .long 0x3771e98f, 0xb3e32c28 - .long 0xb42ae3d9, 0x88f25a3a - .long 0x2178513a, 0x064f7f26 - .long 0xe0ac139e, 0x4e36f0b0 - .long 0x170076fa, 0xdd7e3b0c - .long 0x444dd413, 0xbd6f81f8 - .long 0x6f345e45, 0xf285651c - .long 0x41d17b64, 0x91c9bd4b - .long 0xff0dba97, 0x10746f3c - .long 0xa2b73df1, 0x885f087b - .long 0xf872e54c, 0xc7a68855 - .long 0x1e41e9fc, 0x4c144932 - .long 0x86d8e4d2, 0x271d9844 - .long 0x651bd98b, 0x52148f02 - .long 0x5bb8f1bc, 0x8e766a0c - .long 0xa90fd27a, 0xa3c6f37a - .long 0xb3af077a, 0x93a5f730 - .long 0x4984d782, 0xd7c0557f - .long 0xca6ef3ac, 0x6cb08e5c - .long 0x234e0b26, 0x63ded06a - .long 0xdd66cbbb, 0x6b749fb2 - .long 0x4597456a, 0x4d56973c - .long 0xe9e28eb4, 0x1393e203 - .long 0x7b3ff57a, 0x9669c9df - .long 0xc9c8b782, 0xcec3662e - .long 0x3f70cc6f, 0xe417f38a - .long 0x93e106a4, 0x96c515bb - .long 0x62ec6c6d, 0x4b9e0f71 - .long 0xd813b325, 0xe6fc4e6a - .long 0x0df04680, 0xd104b8fc - .long 0x2342001e, 0x8227bb8a - .long 0x0a2a8d7e, 0x5b397730 - .long 0x6d9a4957, 0xb0cd4768 - .long 0xe8b6368b, 0xe78eb416 - .long 0xd2c3ed1a, 0x39c7ff35 - .long 0x995a5724, 0x61ff0e01 - .long 0x9ef68d35, 0xd7a4825c - .long 0x0c139b31, 0x8d96551c - .long 0xf2271e60, 0x0ab3844b - .long 0x0b0bf8ca, 0x0bf80dd2 - .long 0x2664fd8b, 
0x0167d312 - .long 0xed64812d, 0x8821abed - .long 0x02ee03b2, 0xf6076544 - .long 0x8604ae0f, 0x6a45d2b2 - .long 0x363bd6b3, 0x26f6a60a - .long 0x135c83fd, 0xd8d26619 - .long 0x5fabe670, 0xa741c1bf - .long 0x35ec3279, 0xde87806c - .long 0x00bcf5f6, 0x98d8d9cb - .long 0x8ae00689, 0x14338754 - .long 0x17f27698, 0x49c3cc9c - .long 0x58ca5f00, 0x5bd2011f - .long 0xaa7c7ad5, 0x68bce87a - .long 0xb5cfca28, 0xdd07448e - .long 0xded288f8, 0x57a3d037 - .long 0x59f229bc, 0xdde8f5b9 - .long 0x6d390dec, 0x6956fc3b - .long 0x37170390, 0xa3e3e02c - .long 0x6353c1cc, 0x42d98888 - .long 0xc4584f5c, 0xd73c7bea - .long 0xf48642e9, 0x3771e98f - .long 0x531377e2, 0x80ff0093 - .long 0xdd35bc8d, 0xb42ae3d9 - .long 0xb25b29f2, 0x8fe4c34d - .long 0x9a5ede41, 0x2178513a - .long 0xa563905d, 0xdf99fc11 - .long 0x45cddf4e, 0xe0ac139e - .long 0xacfa3103, 0x6c23e841 - .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S deleted file mode 100644 index 4173051b5197..000000000000 --- a/arch/x86/lib/crc64-pclmul.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// Copyright 2025 Google LLC - -#include "crc-pclmul-template.S" - -DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0) -DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1) diff --git a/arch/x86/lib/crc64.c b/arch/x86/lib/crc64.c deleted file mode 100644 index 351a09f5813e..000000000000 --- a/arch/x86/lib/crc64.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * CRC64 using [V]PCLMULQDQ instructions - * - * Copyright 2025 Google LLC - */ - -#include <linux/crc64.h> -#include <linux/module.h> -#include "crc-pclmul-template.h" - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); -DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); - -u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts, - have_pclmulqdq); - return crc64_be_generic(crc, p, len); -} -EXPORT_SYMBOL_GPL(crc64_be_arch); - -u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts, - have_pclmulqdq); - return crc64_nvme_generic(crc, p, len); -} -EXPORT_SYMBOL_GPL(crc64_nvme_arch); - -static int __init crc64_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc64_msb); - INIT_CRC_PCLMUL(crc64_lsb); - } - return 0; -} -subsys_initcall(crc64_x86_init); - -static void __exit crc64_x86_exit(void) -{ -} -module_exit(crc64_x86_exit); - -MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore deleted file mode 100644 index 580c839bb177..000000000000 --- a/arch/x86/lib/crypto/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-x86_64-cryptogams.S diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig deleted file mode 100644 index 5e94cdee492c..000000000000 --- a/arch/x86/lib/crypto/Kconfig +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" - depends on 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 
(Advanced Vector Extensions-512) - -config CRYPTO_CHACHA20_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD - select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile deleted file mode 100644 index abceca3d31c0..000000000000 --- a/arch/x86/lib/crypto/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - -obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o -sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ - -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) diff --git a/arch/x86/lib/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S deleted file mode 100644 index ac1c845445a4..000000000000 --- a/arch/x86/lib/crypto/blake2s-core.S +++ /dev/null @@ -1,252 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. 
- */ - -#include <linux/linkage.h> - -.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 -.align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 - .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 -.align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 -.align 64 -SIGMA: -.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 -.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 -.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 -.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 -.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 -.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 -.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 -.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 -.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 -.align 64 -SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 - -.text -SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movq %rcx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 - jmp .Lbeginofloop - .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa IV(%rip),%xmm2 - movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx -.Lroundloop: - movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - punpckldq %xmm5,%xmm4 - punpckldq %xmm7,%xmm6 - punpcklqdq %xmm6,%xmm4 - paddd %xmm4,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - punpckldq %xmm6,%xmm5 - punpckldq %xmm4,%xmm7 - punpcklqdq %xmm7,%xmm5 - paddd %xmm5,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x93,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - punpckldq %xmm7,%xmm6 - punpckldq %xmm5,%xmm4 - 
punpcklqdq %xmm4,%xmm6 - paddd %xmm6,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - punpckldq %xmm4,%xmm7 - punpckldq %xmm6,%xmm5 - punpcklqdq %xmm5,%xmm7 - paddd %xmm7,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x39,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx - cmpq %r8,%rcx - jnz .Lroundloop - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm1 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) -.Lendofloop: - RET -SYM_FUNC_END(blake2s_compress_ssse3) - -SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovq %rcx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi - leaq SIGMA2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa -0x20(%rax),%ymm9 - vpermi2d %ymm7,%ymm6,%ymm8 - vpermi2d %ymm7,%ymm6,%ymm9 - vmovdqa %ymm8,%ymm6 - vmovdqa %ymm9,%ymm7 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x93,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x39,%xmm2,%xmm2 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x39,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x93,%xmm2,%xmm2 - decb %cl - jne .Lblake2s_compress_avx512_roundloop - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) - vzeroupper - RET -SYM_FUNC_END(blake2s_compress_avx512) diff --git a/arch/x86/lib/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c deleted file mode 100644 index adc296cd17c9..000000000000 --- a/arch/x86/lib/crypto/blake2s-glue.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ - -#include <asm/cpufeature.h> -#include <asm/fpu/api.h> -#include <asm/processor.h> -#include <asm/simd.h> -#include <crypto/internal/blake2s.h> -#include <linux/init.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/sizes.h> - -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - - if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); - return; - } - - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2S_BLOCK_SIZE); - - kernel_fpu_begin(); - if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); - else - blake2s_compress_ssse3(state, block, blocks, inc); - kernel_fpu_end(); - - nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; - } while (nblocks); -} -EXPORT_SYMBOL(blake2s_compress); - -static int __init blake2s_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - static_branch_enable(&blake2s_use_ssse3); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL)) - static_branch_enable(&blake2s_use_avx512); - - return 0; -} - -subsys_initcall(blake2s_mod_init); diff --git a/arch/x86/lib/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S deleted file mode 100644 index f3d8fc018249..000000000000 --- a/arch/x86/lib/crypto/chacha-avx2-x86_64.S +++ /dev/null @@ -1,1021 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. 
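For orientation, every function in this file vectorizes the same scalar ChaCha quarter round (rotate amounts 16, 12, 8 and 7, as in RFC 8439); the "x0 += x1, x3 = rotl32(x3 ^ x0, 16)" comments below are these steps applied to whole vectors of state words, with vpshufb byte shuffles standing in for the 16- and 8-bit rotates (the AVX-512VL variant further down uses vprold instead). A minimal C sketch of that quarter round, illustrative only (the helper names here are not kernel APIs):

	#include <stdint.h>

	/* Left-rotate a 32-bit word by n bits (0 < n < 32). */
	static inline uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* One ChaCha quarter round on state words a, b, c, d. */
	static void chacha_quarter_round(uint32_t x[16], int a, int b, int c, int d)
	{
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
	}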
It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and 
$0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx2) - -SYM_FUNC_START(chacha_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd 
%ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - 
sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx2) - -SYM_FUNC_START(chacha_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 
- vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - 
vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - 
vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 - vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S deleted file mode 100644 index 259383e1ad44..000000000000 --- a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S +++ /dev/null @@ -1,836 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 
0x00000000000000000000000000000003 - -.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 
0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone2 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx512vl) - -SYM_FUNC_START(chacha_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = 
shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - jl .Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone4 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx512vl) - -SYM_FUNC_START(chacha_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. 
Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. - - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - 
vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq %ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 - vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - 
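For orientation while reading this removal: each vpaddd/vpxord/vprold stanza in the deleted .Ldoubleround8 loop above is one ChaCha quarter-round applied to eight blocks at once, one block per 32-bit lane of a %ymm register. A minimal scalar C sketch of what the per-lane comments ("x0 += x4, x12 = rotl32(x12 ^ x0, 16)", ...) describe, illustrative only and not taken from the deleted files:

#include <stdint.h>

/* Rotate a 32-bit word left by n bits; vprold does this per lane. */
static inline uint32_t rotl32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round on state words a, b, c, d. */
static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

/* One double round: four column rounds, then four diagonal rounds. */
static void chacha_doubleround(uint32_t x[16])
{
        chacha_quarterround(x, 0, 4,  8, 12);
        chacha_quarterround(x, 1, 5,  9, 13);
        chacha_quarterround(x, 2, 6, 10, 14);
        chacha_quarterround(x, 3, 7, 11, 15);
        chacha_quarterround(x, 0, 5, 10, 15);
        chacha_quarterround(x, 1, 6, 11, 12);
        chacha_quarterround(x, 2, 7,  8, 13);
        chacha_quarterround(x, 3, 4,  9, 14);
}

In the eight-block assembly each x[i] lives in its own %ymm register, so every line of the quarter-round above becomes a single vpaddd, vpxord or vprold across all eight lanes.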
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S deleted file mode 100644 index 7111949cd5b9..000000000000 --- a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S +++ /dev/null @@ -1,791 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha_permute - permute one block - * - * 
Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * The round count is given in %r8d. - * - * Clobbers: %r8d, %xmm4-%xmm7 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - sub $2,%r8d - jnz .Ldoubleround - - RET -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - FRAME_BEGIN - - # x0..3 = s0..3 - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - RET - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - 
lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone - -SYM_FUNC_END(chacha_block_xor_ssse3) - -SYM_FUNC_START(hchacha_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - # %edx: nrounds - FRAME_BEGIN - - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - - mov %edx,%r8d - call chacha_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - RET -SYM_FUNC_END(hchacha_block_ssse3) - -SYM_FUNC_START(chacha_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four consecutive ChaCha blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - 
movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd 
%xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - sub $2,%r8d - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq 
%xmm14,%xmm0 - movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq %xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu %xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c deleted file mode 100644 index 10b2c945f541..000000000000 --- a/arch/x86/lib/crypto/chacha_glue.c +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * ChaCha and HChaCha functions (x86_64 optimized) - * - * Copyright (C) 2015 Martin Willi - */ - -#include <asm/simd.h> -#include <crypto/chacha.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> - -asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, - u32 
out[HCHACHA_OUT_WORDS], int nrounds); - -asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state->x[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12]++; - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd)) { - hchacha_block_generic(state, out, nrounds); - 
} else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, out, nrounds); - kernel_fpu_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&chacha_use_simd); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); -MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl deleted file mode 100644 index 501827254fed..000000000000 --- a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl +++ /dev/null @@ -1,4253 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. -# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. -# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86_64. -# -# March 2015 -# -# Initial release. -# -# December 2016 -# -# Add AVX512F+VL+BW code path. -# -# November 2017 -# -# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be -# executed even on Knights Landing. Trigger for modification was -# observation that AVX512 code paths can negatively affect overall -# Skylake-X system performance. Since we are likely to suppress -# AVX512F capability flag [at least on Skylake-X], conversion serves -# as kind of "investment protection". Note that next *lake processor, -# Cannonlake, has AVX512IFMA code path to execute... 
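One pattern worth noting from the deleted chacha_glue.c above: chacha_crypt_arch() feeds at most 4 KiB of data through a single kernel_fpu_begin()/kernel_fpu_end() section, so preemption is never held off for long SIMD runs. A generic sketch of that chunking pattern, with process_simd(), process_generic(), fpu_begin() and fpu_end() as hypothetical stand-ins for the real kernel helpers:

#include <stddef.h>

#define FPU_CHUNK 4096u         /* mirrors the SZ_4K cap in the deleted glue code */

/* Hypothetical stand-ins for the SIMD and scalar processing paths. */
void process_simd(unsigned char *dst, const unsigned char *src, size_t len);
void process_generic(unsigned char *dst, const unsigned char *src, size_t len);
void fpu_begin(void);           /* stand-in for kernel_fpu_begin() */
void fpu_end(void);             /* stand-in for kernel_fpu_end() */

void crypt_chunked(unsigned char *dst, const unsigned char *src,
                   size_t bytes, int have_simd)
{
        if (!have_simd) {
                process_generic(dst, src, bytes);
                return;
        }

        while (bytes) {
                size_t todo = bytes < FPU_CHUNK ? bytes : FPU_CHUNK;

                fpu_begin();            /* SIMD registers usable from here... */
                process_simd(dst, src, todo);
                fpu_end();              /* ...to here; keep this window short */

                bytes -= todo;
                src += todo;
                dst += todo;
        }
}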
-# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. -# -# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] -# Silvermont 2.83/+95% - -# Knights L 3.60/? 1.65 1.10 0.41(***) -# Goldmont 1.70/+180% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# Ryzen 1.15/+200% 1.08 1.18 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; -# (***) strangely enough performance seems to vary from core to core, -# listed result is best case; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -$kernel=0; $kernel=1 if (!$flavour && !$output); - -if (!$kernel) { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or - die "can't locate x86_64-xlate.pl"; - - open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; - *STDOUT=*OUT; - - if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); - } - - if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); - $avx += 1 if ($1==2.11 && $2>=8); - } - - if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); - } - - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); - } -} else { - $avx = 4; # The kernel uses ifdefs for this. 
-} - -sub declare_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub declare_typed_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_TYPED_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub end_function() { - my ($name) = @_; - if($kernel) { - $code .= "SYM_FUNC_END($name)\n"; - } else { - $code .= ".size $name,.-$name\n"; - } -} - -$code.=<<___ if $kernel; -#include <linux/cfi_types.h> -___ - -if ($avx) { -$code.=<<___ if $kernel; -.section .rodata -___ -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ -} -$code.=<<___ if (!$kernel); -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); -my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text -___ -$code.=<<___ if (!$kernel); -.extern OPENSSL_ia32cap_P - -.globl poly1305_block_init_arch -.hidden poly1305_block_init_arch -.globl poly1305_blocks_x86_64 -.hidden poly1305_blocks_x86_64 -.globl poly1305_emit_x86_64 -.hidden poly1305_emit_x86_64 -___ -&declare_typed_function("poly1305_block_init_arch", 32, 3); -$code.=<<___; - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - test $inp,$inp - je .Lno_key -___ -$code.=<<___ if (!$kernel); - lea poly1305_blocks_x86_64(%rip),%r10 - lea poly1305_emit_x86_64(%rip),%r11 -___ -$code.=<<___ if (!$kernel && $avx); - mov OPENSSL_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? - cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if (!$kernel && $avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? - cmovc %rax,%r10 -___ -$code.=<<___ if (!$kernel && $avx>3); - mov \$`(1<<31|1<<21|1<<16)`,%rax - shr \$32,%r9 - and %rax,%r9 - cmp %rax,%r9 - je .Linit_base2_44 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if (!$kernel && $flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if (!$kernel && $flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - RET -___ -&end_function("poly1305_block_init_arch"); - -&declare_function("poly1305_blocks_x86_64", 32, 4); -$code.=<<___; -.cfi_startproc -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $ctx -.cfi_push $ctx -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - - &poly1305_iteration(); - -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov 0(%rsp),$ctx -.cfi_restore $ctx - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data: -.Lblocks_epilogue: - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_x86_64"); - -&declare_function("poly1305_emit_x86_64", 32, 3); -$code.=<<___; -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_x86_64"); -if ($avx) { - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. - -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: - push $ctx -___ - &poly1305_iteration(); -$code.=<<___; - pop $ctx - RET -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - push %rbp - mov %rsp,%rbp - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr \$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and 
\$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # size [de-]optimization - pop %rbp - RET -.size __poly1305_init_avx,.-__poly1305_init_avx -___ - -&declare_function("poly1305_blocks_avx", 32, 4); -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx: -.Lblocks_avx_epilogue: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - 
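__poly1305_init_avx above computes r^2, r^3 and r^4 and stores every power as five base-2^26 digits, with digits 1..4 also written out pre-multiplied by 5 (the "lea (%rax,%rax,4)" *5 lines), so the vector loops never have to multiply by the reduction constant at run time. A sketch of the table contents (illustrative only):

    P = (1 << 130) - 5
    M26 = (1 << 26) - 1

    def to_limbs26(x):
        """Five 26-bit limbs, least significant first (the h[5] / r^n layout)."""
        return [(x >> (26 * i)) & M26 for i in range(5)]

    def power_table(r, n=4):
        """r^1..r^n as (limbs, 5*limbs) pairs; the asm interleaves them so a
        plain vpmuludq serves both the direct and the wrapped partial products."""
        powers = [to_limbs26(pow(r, k, P)) for k in range(1, n + 1)]
        return [(p, [5 * v for v in p]) for p in powers]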
mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx -.cfi_endproc - -.align 32 -.Leven_avx: -.cfi_startproc - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - and \$-32,%rsp - sub \$-8,%rsp - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa 
$D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - 
vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate (inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq $H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - 
vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 - vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - 
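The carry chain above (h3 -> h4, h0 -> h1, ..., with the carry out of h4 re-entering limb 0 times 5 through the vpaddq / vpsllq $2 / vpaddq triple) is the lazy reduction: it only brings each limb back under about 27 bits rather than reducing modulo P, and .Lshort_tail_avx then folds the SIMD lanes together with shift-and-add before the partially reduced limbs are stored. A per-lane Python sketch of both steps (illustrative only):

    M26 = (1 << 26) - 1

    def lazy_reduce(d):
        """Same carry order as the asm (3, 0, 4, 1, 2, 0, 3); the carry out
        of limb 4 returns to limb 0 multiplied by 5, since 2**130 == 5 mod P."""
        d = list(d)
        for i in (3, 0, 4, 1, 2, 0, 3):
            c, d[i] = d[i] >> 26, d[i] & M26
            if i == 4:
                d[0] += 5 * c
            else:
                d[i + 1] += c
        return d

    def horizontal_add(lanes):
        """Collapse per-lane limb accumulators (the vpsrldq/vpermq + vpaddq folds)."""
        return [sum(col) for col in zip(*lanes)]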
vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_avx"); - -&declare_function("poly1305_emit_avx", 32, 3); -$code.=<<___; - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_avx"); - -if ($avx>1) { - -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -sub poly1305_blocks_avxN { - my ($avx512) = @_; - my $suffix = $avx512 ? "_avx512" : ""; -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2$suffix - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2$suffix: - and \$-16,$len - jz .Lno_data_avx2$suffix - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2$suffix - - test \$63,$len - jz .Leven_avx2$suffix - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2$suffix - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2$suffix - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2$suffix - -.align 32 -.Lstore_base2_64_avx2$suffix: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2$suffix - -.align 16 -.Lstore_base2_26_avx2$suffix: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2$suffix: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx2$suffix: -.Lblocks_avx2_epilogue$suffix: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx2$suffix: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2$suffix - -.Lbase2_64_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2$suffix - -.Linit_avx2$suffix: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2$suffix: - mov %r15,$len # restore $len -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d - mov \$`(1<<31|1<<30|1<<16)`,%r11d -___ -$code.=<<___; - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx2_epilogue$suffix: - jmp .Ldo_avx2$suffix -.cfi_endproc - -.align 32 -.Leven_avx2$suffix: 
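Each vector entry point above checks is_base2_26 and, when the hash is still held as five 26-bit limbs, rebuilds the three 2^64-radix words with shifts and ORs; the "can be partially reduced ... so reduce" step that follows is the familiar fold of bits 130 and up times 5, needed because the limbs are only lazily reduced. A sketch of that conversion (illustrative only), where limbs holds the five stored digits:

    def base2_26_to_2_64(limbs):
        """Reassemble h0/h1/h2 from five 26-bit limbs, folding anything at or
        above bit 130 back in times 5 (the shl/shr/or plus 'and $-4' code in
        the avx and avx2 prologues)."""
        h = sum(v << (26 * i) for i, v in enumerate(limbs))
        h = (h & ((1 << 130) - 1)) + 5 * (h >> 130)
        return h & ((1 << 64) - 1), (h >> 64) & ((1 << 64) - 1), h >> 128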
-.cfi_startproc -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d -___ -$code.=<<___; - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2$suffix: -___ -$code.=<<___ if (!$kernel && $avx>2); - cmp \$512,$len - jb .Lskip_avx512 - and %r11d,%r9d - test \$`1<<16`,%r9d # check for AVX512F - jnz .Lblocks_avx512 -.Lskip_avx512$suffix: -___ -$code.=<<___ if ($avx > 2 && $avx512 && $kernel); - cmp \$512,$len - jae .Lblocks_avx512 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx2_body$suffix: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - lea 0x90(%rsp),%rax # size optimization - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermd $T2,$T0,$T2 # 00003412 -> 14243444 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermd $T3,$T0,$T3 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermd $T4,$T0,$T4 - vmovdqa $T2,0x00(%rsp) - vpermd $D0,$T0,$D0 - vmovdqa $T3,0x20-0x90(%rax) - vpermd $D1,$T0,$D1 - vmovdqa $T4,0x40-0x90(%rax) - vpermd $D2,$T0,$D2 - vmovdqa $D0,0x60-0x90(%rax) - vpermd $D3,$T0,$D3 - vmovdqa $D1,0x80-0x90(%rax) - vpermd $D4,$T0,$D4 - vmovdqa $D2,0xa0-0x90(%rax) - vpermd $MASK,$T0,$MASK - vmovdqa $D3,0xc0-0x90(%rax) - vmovdqa $D4,0xe0-0x90(%rax) - vmovdqa $MASK,0x100-0x90(%rax) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2$suffix - jmp .Loop_avx2$suffix - -.align 32 -.Loop_avx2$suffix: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\__________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 
32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2$suffix - - .byte 0x66,0x90 -.Ltail_avx2$suffix: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 
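The .Loop_avx2 and .Ltail_avx2 comments above spell out the vector schedule: with four blocks in flight, every lane is multiplied by r^4 per iteration, and the final iteration instead multiplies lane j by r^(4-j) (the table "displaced by 4") so the lanes can simply be summed; the AVX code does the same with two lanes and r^2, the AVX-512 code below with eight lanes and r^8. A small model showing this equals straight Horner evaluation (sketch only; blocks is assumed to already carry the 2^128 pad bit):

    P = (1 << 130) - 5

    def horner(blocks, r):
        h = 0
        for m in blocks:
            h = (h + m) * r % P
        return h

    def horner_lanes(blocks, r, lanes):
        """Lane j takes blocks j, j+lanes, ... with stride r**lanes; the tail
        scales lane j by r**(lanes - j) before the horizontal add."""
        n = len(blocks)
        assert n % lanes == 0 and n >= lanes
        rL = pow(r, lanes, P)
        h = [0] * lanes
        for i in range(0, n - lanes, lanes):
            for j in range(lanes):
                h[j] = (h[j] + blocks[i + j]) * rL % P
        return sum((h[j] + blocks[n - lanes + j]) * pow(r, lanes - j, P)
                   for j in range(lanes)) % P

    blocks = [(1 << 128) | b for b in range(1, 9)]
    assert horner_lanes(blocks, 12345, 4) == horner(blocks, 12345)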
- - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa -0xb0(%r10),%xmm6 - vmovdqa -0xa0(%r10),%xmm7 - vmovdqa -0x90(%r10),%xmm8 - vmovdqa -0x80(%r10),%xmm9 - vmovdqa -0x70(%r10),%xmm10 - vmovdqa -0x60(%r10),%xmm11 - vmovdqa -0x50(%r10),%xmm12 - vmovdqa -0x40(%r10),%xmm13 - vmovdqa -0x30(%r10),%xmm14 - vmovdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx2_epilogue$suffix: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -if($avx > 2 && $avx512) { -my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); -my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); -my $PADBIT="%zmm30"; - -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); - -$code.=<<___; -.cfi_startproc -.Lblocks_avx512: - mov \$15,%eax - kmovw %eax,%k2 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx512_body: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 - - # expand pre-calculated table - vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} - mov \$0x20,%rax - vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} - vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} - vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} - vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} - vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} - vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} - vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} - vpermd $D0,$T2,$R0 # 00003412 -> 14243444 - vpbroadcastq 64(%rcx),$MASK # .Lmask26 - vpermd $D1,$T2,$R1 - vpermd $T0,$T2,$S1 - vpermd $D2,$T2,$R2 - vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 - vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $T1,$T2,$S2 - vmovdqu64 $R1,0x00(%rsp,%rax){%k2} - vpsrlq \$32,$R1,$T1 - vpermd $D3,$T2,$R3 - vmovdqa64 $S1,0x40(%rsp){%k2} - vpermd $T3,$T2,$S3 - vpermd $D4,$T2,$R4 - vmovdqu64 $R2,0x40(%rsp,%rax){%k2} - vpermd $T4,$T2,$S4 - vmovdqa64 $S2,0x80(%rsp){%k2} - vmovdqu64 $R3,0x80(%rsp,%rax){%k2} - vmovdqa64 $S3,0xc0(%rsp){%k2} - vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} - vmovdqa64 $S4,0x100(%rsp){%k2} - - ################################################################ - # calculate 5th through 8th powers of the key - # - # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 - # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 - # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 - # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 - # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 - - vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 - vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 - vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 - vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 - vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 - vpsrlq \$32,$R2,$T2 - - vpmuludq $T1,$S4,$M0 - vpmuludq $T1,$R0,$M1 - vpmuludq $T1,$R1,$M2 - vpmuludq $T1,$R2,$M3 - vpmuludq $T1,$R3,$M4 - vpsrlq \$32,$R3,$T3 - vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 - vpaddq $M1,$D1,$D1 # d1 += r1'*r0 - vpaddq $M2,$D2,$D2 # d2 += r1'*r1 - vpaddq $M3,$D3,$D3 # d3 += r1'*r2 - vpaddq $M4,$D4,$D4 # d4 += r1'*r3 - - vpmuludq $T2,$S3,$M0 - vpmuludq $T2,$S4,$M1 - vpmuludq $T2,$R1,$M3 - vpmuludq $T2,$R2,$M4 - vpmuludq $T2,$R0,$M2 - vpsrlq \$32,$R4,$T4 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 - vpaddq $M3,$D3,$D3 # d3 += r2'*r1 - vpaddq $M4,$D4,$D4 # d4 += r2'*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*r0 - - vpmuludq $T3,$S2,$M0 - vpmuludq $T3,$R0,$M3 - vpmuludq $T3,$R1,$M4 - vpmuludq $T3,$S3,$M1 - vpmuludq $T3,$S4,$M2 - vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 - vpaddq $M3,$D3,$D3 # d3 += r3'*r0 - vpaddq $M4,$D4,$D4 # d4 += r3'*r1 - vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 - vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 - - vpmuludq $T4,$S4,$M3 - vpmuludq $T4,$R0,$M4 - vpmuludq $T4,$S1,$M0 - vpmuludq $T4,$S2,$M1 - vpmuludq $T4,$S3,$M2 - vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 - vpaddq $M4,$D4,$D4 # d4 += r2'*r0 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 - - ################################################################ - # load input - vmovdqu64 16*0($inp),%z#$T3 - vmovdqu64 16*4($inp),%z#$T4 - lea 16*8($inp),$inp - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D4,$M4 - vpandq $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$M1 - vpandq $MASK,$D1,$D1 - vpaddq $M1,$D2,$D2 # d1 -> d2 - - vpaddq $M4,$D0,$D0 - vpsllq \$2,$M4,$M4 - vpaddq $M4,$D0,$D0 # d4 -> d0 - - vpsrlq \$26,$D2,$M2 - vpandq $MASK,$D2,$D2 - vpaddq $M2,$D3,$D3 # d2 -> d3 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - ################################################################ - # at this point we have 14243444 in $R0-$S4 and 05060708 in - # $D0-$D4, ... 
- - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - # ... since input 64-bit lanes are ordered as 73625140, we could - # "vperm" it to 76543210 (here and in each loop iteration), *or* - # we could just flow along, hence the goal for $R0-$S4 is - # 1858286838784888 ... - - vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: - mov \$0x7777,%eax - kmovw %eax,%k1 - - vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- - vpermd $R1,$M0,$R1 - vpermd $R2,$M0,$R2 - vpermd $R3,$M0,$R3 - vpermd $R4,$M0,$R4 - - vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 - vpermd $D1,$M0,${R1}{%k1} - vpermd $D2,$M0,${R2}{%k1} - vpermd $D3,$M0,${R3}{%k1} - vpermd $D4,$M0,${R4}{%k1} - - vpslld \$2,$R1,$S1 # *5 - vpslld \$2,$R2,$S2 - vpslld \$2,$R3,$S3 - vpslld \$2,$R4,$S4 - vpaddd $R1,$S1,$S1 - vpaddd $R2,$S2,$S2 - vpaddd $R3,$S3,$S3 - vpaddd $R4,$S4,$S4 - - vpbroadcastq 32(%rcx),$PADBIT # .L129 - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - vporq $T3,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$14,$T4,$T3 - vpsrlq \$40,$T4,$T4 # 4 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$192,$len - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - ################################################################ - # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 - # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 - # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 - # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 - # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 - # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 - # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 - # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 - # \________/\___________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 - # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 - # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 - # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpaddq $H0,$T0,$H0 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu64 16*0($inp),$T3 # load input - vmovdqu64 16*4($inp),$T4 - lea 16*8($inp),$inp - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - vpmuludq $H3,$R0,$M3 - 
vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpaddq $M3,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$S4,$M2 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - - vpsrlq \$26,$D3,$H3 - vpandq $MASK,$D3,$D3 - vpaddq $H3,$D4,$H4 # h3 -> h4 - - vporq $T3,$T2,$T2 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpandq $MASK,$T2,$T2 # 2 - - vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpaddq $D2,$D3,$H3 # h2 -> h3 - - vpsrlq \$14,$T4,$T3 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - sub \$128,$len - ja .Loop_avx512 - -.Ltail_avx512: - ################################################################ - # while above multiplications were by r^8 in all lanes, in last - # iteration we multiply least significant lane by r^8 and most - # significant one by r, that's why table gets shifted... 
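The "calculate 5th through 8th powers of the key" block above reuses the d0..d4 formulas that every inner-loop comment repeats: a 5x5 schoolbook product in radix 2^26 in which any partial product landing at weight 2^130 or higher wraps around with an extra factor of 5, which is why the 5*r limbs are kept precomputed (here refreshed with vpslld $2 plus vpaddd). A limb-level sketch of that product (illustrative only):

    def mul_limbs26(h, r):
        """d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4, d3 = ... + h4*5*r4, etc.
        Partial products with i + j >= 5 wrap past 2**130 and gain the factor 5."""
        d = [0] * 5
        for i in range(5):
            for j in range(5):
                if i + j < 5:
                    d[i + j] += h[i] * r[j]
                else:
                    d[i + j - 5] += h[i] * 5 * r[j]
        return d

    # With value(x) = sum(x[i] << (26 * i)), value(mul_limbs26(a, b)) is
    # congruent to value(a) * value(b) modulo 2**130 - 5; the carry chain
    # then shrinks the five accumulators back to roughly 26 bits each.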
- - vpsrlq \$32,$R0,$R0 # 0105020603070408 - vpsrlq \$32,$R1,$R1 - vpsrlq \$32,$R2,$R2 - vpsrlq \$32,$S3,$S3 - vpsrlq \$32,$S4,$S4 - vpsrlq \$32,$R3,$R3 - vpsrlq \$32,$R4,$R4 - vpsrlq \$32,$S1,$S1 - vpsrlq \$32,$S2,$S2 - - ################################################################ - # load either next or last 64 byte of input - lea ($inp,$len),$inp - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu 16*0($inp),%x#$T0 - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vmovdqu 16*1($inp),%x#$T1 - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 - vpmuludq $H3,$R0,$M3 - vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpmuludq $H3,$S4,$M2 - vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # horizontal addition - - mov \$1,%eax - vpermq \$0xb1,$H3,$D3 - vpermq \$0xb1,$D4,$H4 - vpermq \$0xb1,$H0,$D0 - vpermq \$0xb1,$H1,$D1 - vpermq \$0xb1,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - kmovw %eax,%k3 - vpermq \$0x2,$H3,$D3 - vpermq \$0x2,$H4,$D4 - vpermq \$0x2,$H0,$D0 - vpermq \$0x2,$H1,$D1 - vpermq \$0x2,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - vextracti64x4 \$0x1,$H3,%y#$D3 - vextracti64x4 \$0x1,$H4,%y#$D4 - vextracti64x4 \$0x1,$H0,%y#$D0 - vextracti64x4 \$0x1,$H1,%y#$D1 - vextracti64x4 \$0x1,$H2,%y#$D2 - vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case - vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 - vpaddq $D0,$H0,${H0}{%k3}{z} - vpaddq $D1,$H1,${H1}{%k3}{z} - vpaddq $D2,$H2,${H2}{%k3}{z} -___ -map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); -map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); -$code.=<<___; - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpunpcklqdq $T3,$T2,$T2 
# 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpand $MASK,$T1,$T1 # 1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - vpaddq $D3,$H4,$H4 # h3 -> h4 - - lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 - add \$64,$len - jnz .Ltail_avx2$suffix - - vpsubq $T2,$H2,$H2 # undo input accumulation - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) - vzeroall -___ -$code.=<<___ if ($win64); - movdqa -0xb0(%r10),%xmm6 - movdqa -0xa0(%r10),%xmm7 - movdqa -0x90(%r10),%xmm8 - movdqa -0x80(%r10),%xmm9 - movdqa -0x70(%r10),%xmm10 - movdqa -0x60(%r10),%xmm11 - movdqa -0x50(%r10),%xmm12 - movdqa -0x40(%r10),%xmm13 - movdqa -0x30(%r10),%xmm14 - movdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx512_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - RET -.cfi_endproc -___ - -} - -} - -&declare_function("poly1305_blocks_avx2", 32, 4); -poly1305_blocks_avxN(0); -&end_function("poly1305_blocks_avx2"); - -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - -&declare_function("poly1305_blocks_avx512", 32, 4); -poly1305_blocks_avxN(1); -&end_function("poly1305_blocks_avx512"); - -if (!$kernel && $avx>3) { -######################################################################## -# VPMADD52 version using 2^44 radix. -# -# One can argue that base 2^52 would be more natural. Well, even though -# some operations would be more natural, one has to recognize couple of -# things. Base 2^52 doesn't provide advantage over base 2^44 if you look -# at amount of multiply-n-accumulate operations. Secondly, it makes it -# impossible to pre-compute multiples of 5 [referred to as s[]/sN in -# reference implementations], which means that more such operations -# would have to be performed in inner loop, which in turn makes critical -# path longer. In other words, even though base 2^44 reduction might -# look less elegant, overall critical path is actually shorter... - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^44 -# unsigned __int64 s[2]; # key value*20 base 2^44 -# unsigned __int64 r[3]; # key value base 2^44 -# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; -# # r^n positions reflect -# # placement in register, not -# # memory, R[3] is R[1]*20 - -$code.=<<___; -.type poly1305_init_base2_44,\@function,3 -.align 32 -poly1305_init_base2_44: - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - -.Linit_base2_44: - lea poly1305_blocks_vpmadd52(%rip),%r10 - lea poly1305_emit_base2_44(%rip),%r11 - - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - mov \$0x00000fffffffffff,%r8 - and 8($inp),%rcx - mov \$0x00000fffffffffff,%r9 - and %rax,%r8 - shrd \$44,%rcx,%rax - mov %r8,40($ctx) # r0 - and %r9,%rax - shr \$24,%rcx - mov %rax,48($ctx) # r1 - lea (%rax,%rax,4),%rax # *5 - mov %rcx,56($ctx) # r2 - shl \$2,%rax # magic <<2 - lea (%rcx,%rcx,4),%rcx # *5 - shl \$2,%rcx # magic <<2 - mov %rax,24($ctx) # s1 - mov %rcx,32($ctx) # s2 - movq \$-1,64($ctx) # write impossible value -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax - RET -.size poly1305_init_base2_44,.-poly1305_init_base2_44 -___ -{ -my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); -my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); -my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52,\@function,4 -.align 32 -poly1305_blocks_vpmadd52: - shr \$4,$len - jz .Lno_data_vpmadd52 # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - # if powers of the key are not calculated yet, process up to 3 - # blocks with this single-block subroutine, otherwise ensure that - # length is divisible by 2 blocks and pass the rest down to next - # subroutine... - - mov \$3,%rax - mov \$1,%r10 - cmp \$4,$len # is input long - cmovae %r10,%rax - test %r8,%r8 # is power value impossible? - cmovns %r10,%rax - - and $len,%rax # is input of favourable length? 
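Aside: the removed comments above describe the 2^44-radix layout used by the VPMADD52 path, where the clamped 128-bit key is split into 44/44/42-bit limbs and multiples of r*20 are precomputed so the wrap-around of the modulus 2^130 - 5 can be folded into the multiplies (2^132 is congruent to 20 mod 2^130 - 5). As a rough illustration only, not the kernel's actual code (the real setup is the assembly being deleted here), a key-setup sketch in C could look like the following; the struct and function names are invented for the example and a little-endian host is assumed.

#include <stdint.h>
#include <string.h>

/* Hypothetical container for the base 2^44 key material. */
struct poly1305_base2_44_key {
	uint64_t r[3];   /* clamped key as 44/44/42-bit limbs          */
	uint64_t s[2];   /* r[1]*20 and r[2]*20, precomputed            */
};

static void poly1305_key_base2_44(struct poly1305_base2_44_key *key,
				  const uint8_t raw[16])
{
	uint64_t lo, hi;

	memcpy(&lo, raw, 8);           /* assumes little-endian host      */
	memcpy(&hi, raw + 8, 8);

	lo &= 0x0ffffffc0fffffffULL;   /* RFC 7539 clamping               */
	hi &= 0x0ffffffc0ffffffcULL;

	key->r[0] = lo & 0xfffffffffffULL;                          /* bits 0..43   */
	key->r[1] = ((lo >> 44) | (hi << 20)) & 0xfffffffffffULL;   /* bits 44..87  */
	key->r[2] = hi >> 24;                                       /* bits 88..129 */

	/* 2^132 == 20 (mod 2^130 - 5), so keeping r*20 around lets the
	 * high cross-terms be handled with a single multiply each. */
	key->s[0] = key->r[1] * 20;
	key->s[1] = key->r[2] * 20;
}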
- jz .Lblocks_vpmadd52_4x - - sub %rax,$len - mov \$7,%r10d - mov \$1,%r11d - kmovw %r10d,%k7 - lea .L2_44_inp_permd(%rip),%r10 - kmovw %r11d,%k1 - - vmovq $padbit,%x#$PAD - vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd - vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift - vpermq \$0xcf,$PAD,$PAD - vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask - - vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value - vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys - vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} - vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} - - vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt - vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft - - jmp .Loop_vpmadd52 - -.align 32 -.Loop_vpmadd52: - vmovdqu32 0($inp),%x#$T0 # load input as ----3210 - lea 16($inp),$inp - - vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 - vpsrlvq $inp_shift,$T0,$T0 - vpandq $reduc_mask,$T0,$T0 - vporq $PAD,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo # accumulate input - - vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value - vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} - vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} - - vpxord $Dlo,$Dlo,$Dlo - vpxord $Dhi,$Dhi,$Dhi - - vpmadd52luq $r2r1r0,$H0,$Dlo - vpmadd52huq $r2r1r0,$H0,$Dhi - - vpmadd52luq $r1r0s2,$H1,$Dlo - vpmadd52huq $r1r0s2,$H1,$Dhi - - vpmadd52luq $r0s2s1,$H2,$Dlo - vpmadd52huq $r0s2s1,$H2,$Dhi - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword - vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword - vpandq $reduc_mask,$Dlo,$Dlo - - vpaddq $T0,$Dhi,$Dhi - - vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword - - vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word - vpandq $reduc_mask,$Dlo,$Dlo - - vpermq \$0b10010011,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} - - vpaddq $T0,$Dlo,$Dlo - vpsllq \$2,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - dec %rax # len-=16 - jnz .Loop_vpmadd52 - - vmovdqu64 $Dlo,0($ctx){%k7} # store hash value - - test $len,$len - jnz .Lblocks_vpmadd52_4x - -.Lno_data_vpmadd52: - RET -.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 -___ -} -{ -######################################################################## -# As implied by its name 4x subroutine processes 4 blocks in parallel -# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power -# and is handled in 256-bit %ymm registers. - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_4x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_4x: - shr \$4,$len - jz .Lno_data_vpmadd52_4x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - -.Lblocks_vpmadd52_4x: - vpbroadcastq $padbit,$PAD - - vmovdqa64 .Lx_mask44(%rip),$mask44 - mov \$5,%eax - vmovdqa64 .Lx_mask42(%rip),$mask42 - kmovw %eax,%k1 # used in 2x path - - test %r8,%r8 # is power value impossible? - js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? - jnz .Lblocks_vpmadd52_2x_do - -.Lblocks_vpmadd52_4x_do: - vpbroadcastq 64($ctx),$R0 # load 4th power of the key - vpbroadcastq 96($ctx),$R1 - vpbroadcastq 128($ctx),$R2 - vpbroadcastq 160($ctx),$S1 - -.Lblocks_vpmadd52_4x_key_loaded: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - test \$7,$len # is len 8*n? 
- jz .Lblocks_vpmadd52_8x - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 3-1-2-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$4,$len - jz .Ltail_vpmadd52_4x - jmp .Loop_vpmadd52_4x - ud2 - -.align 32 -.Linit_vpmadd52: - vmovq 24($ctx),%x#$S1 # load key - vmovq 56($ctx),%x#$H2 - vmovq 32($ctx),%x#$S2 - vmovq 40($ctx),%x#$R0 - vmovq 48($ctx),%x#$R1 - - vmovdqa $R0,$H0 - vmovdqa $R1,$H1 - vmovdqa $H2,$R2 - - mov \$2,%eax - -.Lmul_init_vpmadd52: - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - dec %eax - jz .Ldone_init_vpmadd52 - - vpunpcklqdq $R1,$H1,$R1 # 1,2 - vpbroadcastq %x#$H1,%x#$H1 # 2,2 - vpunpcklqdq $R2,$H2,$R2 - vpbroadcastq %x#$H2,%x#$H2 - vpunpcklqdq $R0,$H0,$R0 - vpbroadcastq %x#$H0,%x#$H0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R1,$S1,$S1 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S1,$S1 - vpsllq \$2,$S2,$S2 - - jmp .Lmul_init_vpmadd52 - ud2 - -.align 32 -.Ldone_init_vpmadd52: - vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 - vinserti128 \$1,%x#$R2,$H2,$R2 - vinserti128 \$1,%x#$R0,$H0,$R0 - - vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 - vpermq \$0b11011000,$R2,$R2 - vpermq \$0b11011000,$R0,$R0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpaddq $R1,$S1,$S1 - vpsllq \$2,$S1,$S1 - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? 
- jnz .Ldone_init_vpmadd52_2x - - vmovdqu64 $R0,64($ctx) # save key powers - vpbroadcastq %x#$R0,$R0 # broadcast 4th power - vmovdqu64 $R1,96($ctx) - vpbroadcastq %x#$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpbroadcastq %x#$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpbroadcastq %x#$S1,$S1 - - jmp .Lblocks_vpmadd52_4x_key_loaded - ud2 - -.align 32 -.Ldone_init_vpmadd52_2x: - vmovdqu64 $R0,64($ctx) # save key powers - vpsrldq \$8,$R0,$R0 # 0-1-0-2 - vmovdqu64 $R1,96($ctx) - vpsrldq \$8,$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpsrldq \$8,$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpsrldq \$8,$S1,$S1 - jmp .Lblocks_vpmadd52_2x_key_loaded - ud2 - -.align 32 -.Lblocks_vpmadd52_2x_do: - vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers - vmovdqu64 160+8($ctx),${S1}{%k1}{z} - vmovdqu64 64+8($ctx),${R0}{%k1}{z} - vmovdqu64 96+8($ctx),${R1}{%k1}{z} - -.Lblocks_vpmadd52_2x_key_loaded: - vmovdqu64 16*0($inp),$T2 # load data - vpxorq $T3,$T3,$T3 - lea 16*2($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as x-1-x-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - jmp .Ltail_vpmadd52_2x - ud2 - -.align 32 -.Loop_vpmadd52_4x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$4,$len # len-=64 - jnz .Loop_vpmadd52_4x - -.Ltail_vpmadd52_4x: - vmovdqu64 128($ctx),$R2 # load all key powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - -.Ltail_vpmadd52_2x: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - 
vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - # at this point $len is - # either 4*n+2 or 0... - sub \$2,$len # len-=32 - ja .Lblocks_vpmadd52_4x_do - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_4x: - RET -.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x -___ -} -{ -######################################################################## -# As implied by its name 8x subroutine processes 8 blocks in parallel... -# This is intermediate version, as it's used only in cases when input -# length is either 8*n, 8*n+1 or 8*n+2... - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); -my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_8x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_8x: - shr \$4,$len - jz .Lno_data_vpmadd52_8x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - vmovdqa64 .Lx_mask44(%rip),$mask44 - vmovdqa64 .Lx_mask42(%rip),$mask42 - - test %r8,%r8 # is power value impossible? 
- js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - -.Lblocks_vpmadd52_8x: - ################################################################ - # fist we calculate more key powers - - vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - vpbroadcastq %x#$R2,$RR2 # broadcast 4th power - vpbroadcastq %x#$R0,$RR0 - vpbroadcastq %x#$R1,$RR1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $RR2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $RR2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $RR2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $RR2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $RR2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $RR2,$R0,$D2hi - - vpmadd52luq $RR0,$R0,$D0lo - vpmadd52huq $RR0,$R0,$D0hi - vpmadd52luq $RR0,$R1,$D1lo - vpmadd52huq $RR0,$R1,$D1hi - vpmadd52luq $RR0,$R2,$D2lo - vpmadd52huq $RR0,$R2,$D2hi - - vpmadd52luq $RR1,$S2,$D0lo - vpmadd52huq $RR1,$S2,$D0hi - vpmadd52luq $RR1,$R0,$D1lo - vpmadd52huq $RR1,$R0,$D1hi - vpmadd52luq $RR1,$R1,$D2lo - vpmadd52huq $RR1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$RR0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$RR1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$RR2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - - vpsrlq \$44,$RR0,$tmp # additional step - vpandq $mask44,$RR0,$RR0 - - vpaddq $tmp,$RR1,$RR1 - - ################################################################ - # At this point Rx holds 1324 powers, RRx - 5768, and the goal - # is 15263748, which reflects how data is loaded... 
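Aside: the parallel paths above work because n blocks can be absorbed at once by giving each lane a different power of r, which is why the code spends effort arranging r^1..r^8 to match the load order of the data. The repeated "partial reduction" sequences are a carry chain over the 44/44/42-bit limbs, with the carry out of bit 130 folded back in as a multiply by 5 (the add, shift-left-by-2, add pattern). A scalar sketch of one multiply-and-reduce step, using the same limb convention as the key-setup sketch earlier, might look like this; it relies on the GCC/Clang unsigned __int128 extension and is illustrative only, not the kernel's reference code.

#include <stdint.h>

typedef unsigned __int128 u128;

#define MASK44 ((1ULL << 44) - 1)
#define MASK42 ((1ULL << 42) - 1)

/*
 * h[] holds the accumulator as 44/44/42-bit limbs, r[] the key limbs,
 * s[0] = r[1]*20 and s[1] = r[2]*20.  One h = h*r (mod 2^130 - 5) step
 * with the same partial reduction the removed assembly performs.
 */
static void poly1305_mul_reduce_2_44(uint64_t h[3], const uint64_t r[3],
				     const uint64_t s[2])
{
	u128 d0, d1, d2;
	uint64_t c;

	d0 = (u128)h[0] * r[0] + (u128)h[1] * s[1] + (u128)h[2] * s[0];
	d1 = (u128)h[0] * r[1] + (u128)h[1] * r[0] + (u128)h[2] * s[1];
	d2 = (u128)h[0] * r[2] + (u128)h[1] * r[1] + (u128)h[2] * r[0];

	c = (uint64_t)(d0 >> 44);  h[0] = (uint64_t)d0 & MASK44;
	d1 += c;
	c = (uint64_t)(d1 >> 44);  h[1] = (uint64_t)d1 & MASK44;
	d2 += c;
	c = (uint64_t)(d2 >> 42);  h[2] = (uint64_t)d2 & MASK42;
	h[0] += c * 5;             /* 2^130 == 5 (mod 2^130 - 5)          */
	c = h[0] >> 44;            h[0] &= MASK44;
	h[1] += c;                 /* the "additional step" in the asm    */
}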
- - vpunpcklqdq $R2,$RR2,$T2 # 3748 - vpunpckhqdq $R2,$RR2,$R2 # 1526 - vpunpcklqdq $R0,$RR0,$T0 - vpunpckhqdq $R0,$RR0,$R0 - vpunpcklqdq $R1,$RR1,$T1 - vpunpckhqdq $R1,$RR1,$R1 -___ -######## switch to %zmm -map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); -map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); - -$code.=<<___; - vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 - vshufi64x2 \$0x44,$R0,$T0,$RR0 - vshufi64x2 \$0x44,$R1,$T1,$RR1 - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - - vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 - vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 - vpaddq $RR2,$SS2,$SS2 - vpaddq $RR1,$SS1,$SS1 - vpsllq \$2,$SS2,$SS2 - vpsllq \$2,$SS1,$SS1 - - vpbroadcastq $padbit,$PAD - vpbroadcastq %x#$mask44,$mask44 - vpbroadcastq %x#$mask42,$mask42 - - vpbroadcastq %x#$SS1,$S1 # broadcast 8th power - vpbroadcastq %x#$SS2,$S2 - vpbroadcastq %x#$RR0,$R0 - vpbroadcastq %x#$RR1,$R1 - vpbroadcastq %x#$RR2,$R2 - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 73625140 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$8,$len - jz .Ltail_vpmadd52_8x - jmp .Loop_vpmadd52_8x - -.align 32 -.Loop_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$8,$len # len-=128 - jnz .Loop_vpmadd52_8x - -.Ltail_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$SS1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$SS1,$D0hi - vpxorq 
$D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$SS2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$SS2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$RR0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$RR0,$D2hi - - vpmadd52luq $H0,$RR0,$D0lo - vpmadd52huq $H0,$RR0,$D0hi - vpmadd52luq $H0,$RR1,$D1lo - vpmadd52huq $H0,$RR1,$D1hi - vpmadd52luq $H0,$RR2,$D2lo - vpmadd52huq $H0,$RR2,$D2hi - - vpmadd52luq $H1,$SS2,$D0lo - vpmadd52huq $H1,$SS2,$D0hi - vpmadd52luq $H1,$RR0,$D1lo - vpmadd52huq $H1,$RR0,$D1hi - vpmadd52luq $H1,$RR1,$D2lo - vpmadd52huq $H1,$RR1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vextracti64x4 \$1,$D0lo,%y#$T0 - vextracti64x4 \$1,$D0hi,%y#$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vextracti64x4 \$1,$D1lo,%y#$T1 - vextracti64x4 \$1,$D1hi,%y#$H1 - vextracti64x4 \$1,$D2lo,%y#$T2 - vextracti64x4 \$1,$D2hi,%y#$H2 -___ -######## switch back to %ymm -map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); - -$code.=<<___; - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - ################################################################ - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_8x: - RET -.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x -___ -} -$code.=<<___; -.type poly1305_emit_base2_44,\@function,3 -.align 32 -poly1305_emit_base2_44: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r9,%rax - shr \$20,%r9 - shl \$44,%rax - mov %r10,%rcx - shr \$40,%r10 - shl \$24,%rcx - - add %rax,%r8 - adc %rcx,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? 
- cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 -___ -} } } -} - -if (!$kernel) -{ # chacha20-poly1305 helpers -my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order -$code.=<<___; -.globl xor128_encrypt_n_pad -.type xor128_encrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_encrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_enc - nop -.Loop_enc_xmm: - movdqu ($inp,$otp),%xmm0 - pxor ($otp),%xmm0 - movdqu %xmm0,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_enc_xmm - - and \$15,%r10 # len % 16 - jz .Ldone_enc - -.Ltail_enc: - mov \$16,$len - sub %r10,$len - xor %eax,%eax -.Loop_enc_byte: - mov ($inp,$otp),%al - xor ($otp),%al - mov %al,($out,$otp) - mov %al,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_enc_byte - - xor %eax,%eax -.Loop_enc_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_enc_pad - -.Ldone_enc: - mov $otp,%rax - RET -.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad - -.globl xor128_decrypt_n_pad -.type xor128_decrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_decrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_dec - nop -.Loop_dec_xmm: - movdqu ($inp,$otp),%xmm0 - movdqa ($otp),%xmm1 - pxor %xmm0,%xmm1 - movdqu %xmm1,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_dec_xmm - - pxor %xmm1,%xmm1 - and \$15,%r10 # len % 16 - jz .Ldone_dec - -.Ltail_dec: - mov \$16,$len - sub %r10,$len - xor %eax,%eax - xor %r11d,%r11d -.Loop_dec_byte: - mov ($inp,$otp),%r11b - mov ($otp),%al - xor %r11b,%al - mov %al,($out,$otp) - mov %r11b,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_dec_byte - - xor %eax,%eax -.Loop_dec_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_dec_pad - -.Ldone_dec: - mov $otp,%rax - RET -.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad -___ -} - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R14 - - jmp 
.Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - RET -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_poly1305_block_init_arch - .rva .LSEH_end_poly1305_block_init_arch - .rva .LSEH_info_poly1305_block_init_arch - - .rva .LSEH_begin_poly1305_blocks_x86_64 - .rva .LSEH_end_poly1305_blocks_x86_64 - .rva .LSEH_info_poly1305_blocks_x86_64 - - .rva .LSEH_begin_poly1305_emit_x86_64 - .rva .LSEH_end_poly1305_emit_x86_64 - .rva .LSEH_info_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva .LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_poly1305_blocks_avx512 - .rva .LSEH_end_poly1305_blocks_avx512 - .rva .LSEH_info_poly1305_blocks_avx512 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_poly1305_block_init_arch: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch - -.LSEH_info_poly1305_blocks_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva 
.Lblocks_body,.Lblocks_epilogue - -.LSEH_info_poly1305_emit_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -$code.=<<___ if ($avx>2); -.LSEH_info_poly1305_blocks_avx512: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] -___ -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; - - if ($kernel) { - s/(^\.type.*),[0-9]+$/\1/; - s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; - next if /^\.cfi.*/; - } - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c deleted file mode 100644 index b7e78a583e07..000000000000 --- a/arch/x86/lib/crypto/poly1305_glue.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ - -#include <asm/cpu_device_id.h> -#include <asm/fpu/api.h> -#include <crypto/internal/poly1305.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <linux/unaligned.h> - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, - unsigned int len, u32 padbit) -{ - struct poly1305_arch_internal *ctx = - container_of(&state->h.h, struct poly1305_arch_internal, h); - - /* SIMD disables preemption, so relax after processing each page. 
*/ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - if (!static_branch_likely(&poly1305_use_avx)) { - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const unsigned int bytes = min(len, SZ_4K); - - kernel_fpu_begin(); - if (static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -void poly1305_emit_arch(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&poly1305_use_avx); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return 0; -} -subsys_initcall(poly1305_simd_mod_init); - -static void __exit poly1305_simd_mod_exit(void) -{ -} -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); -MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/arch/x86/lib/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S deleted file mode 100644 index 0d7b2c3e45d9..000000000000 --- a/arch/x86/lib/crypto/sha256-avx-asm.S +++ /dev/null @@ -1,499 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX1 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 1 block at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/objtool.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -.macro MY_ROR p1 p2 - shld $(32-(\p1)), \p2, \p2 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 -XTMP5 = %xmm11 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm13 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add 
_XFER(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpsrld $7, XTMP1, XTMP2 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpslld $(32-7), XTMP1, XTMP3 - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (22-13), y1 # y1 = a >> (22-13) - vpsrld $18, XTMP1, XTMP2 # - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpslld $(32-18), XTMP1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP1, XTMP3, XTMP3 # - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP3, XTMP2, XTMP2 # - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - MY_ROR (22-13), y1 # y1 = a >> (22-13) 
- xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP3, XTMP2, XTMP2 - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER # - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_avx) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rsp, %rbp - - subq $STACK_SIZE, %rsp # allocate stack space - and $~15, %rsp # align stack pointer - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, _INP_END(%rsp) - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), 
BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 1*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 2*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 3*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vpaddd 1*16(TBL), X1, XFER - vmovdqa XFER, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X2, X0 - vmovdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - RET -SYM_FUNC_END(sha256_transform_avx) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S deleted file mode 100644 index 25d3380321ec..000000000000 --- a/arch/x86/lib/crypto/sha256-avx2-asm.S +++ /dev/null @@ -1,774 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. 
You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 2 blocks at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/objtool.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -X0 = %ymm4 -X1 = %ymm5 -X2 = %ymm6 -X3 = %ymm7 - -# XMM versions of above -XWORD0 = %xmm4 -XWORD1 = %xmm5 -XWORD2 = %xmm6 -XWORD3 = %xmm7 - -XTMP0 = %ymm0 -XTMP1 = %ymm1 -XTMP2 = %ymm2 -XTMP3 = %ymm3 -XTMP4 = %ymm8 -XFER = %ymm9 -XTMP5 = %ymm11 - -SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %ymm13 - -X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg -c = %ecx -d = %r8d -e = %edx # clobbers NUM_BLKS -y3 = %esi # clobbers INP - -SRND = CTX # SRND is same register as CTX - -a = %eax -b = %ebx -f = %r9d -g = %r10d -h = %r11d -old_h = %r11d - -T1 = %r12d -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round -_XMM_SAVE_SIZE = 0 -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_CTX_SIZE = 8 - -_XFER = 0 -_XMM_SAVE = _XFER + _XFER_SIZE -_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE -_INP = _INP_END + _INP_END_SIZE -_CTX = _INP + _INP_SIZE -STACK_SIZE = _CTX + _CTX_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs - X_ = X0 - X0 = X1 - X1 = X2 - X2 = X3 - X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED disp -################################### RND N + 0 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = 
e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - - and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - vpsrld $7, XTMP1, XTMP2 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 = S1 + CH # -- - vpslld $(32-7), XTMP1, XTMP3 - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 - - vpsrld $18, XTMP1, XTMP2 - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 1 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 1*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - vpslld $(32-18), XTMP1, XTMP1 - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - - vpxor XTMP1, XTMP3, XTMP3 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - - - ROTATE_ARGS - -################################### RND N + 2 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - offset = \disp + 2*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - rorx $11, e, y1 # y1 = e >> 11 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - and e, y2 # y2 = (f^g)&e # CH - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - vpxor XTMP3, XTMP2, XTMP2 - add 
h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a ,T1 # T1 = (a >> 2) # S0 - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1,h # h = k + w + h + S0 # -- - add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3,h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 3 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 3*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP3, XTMP2, XTMP2 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $2, a, T1 # T1 = (a >> 2) # S0 - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - rotate_Xs -.endm - -.macro DO_4ROUNDS disp -################################### RND N + 0 ########################### - - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS 
- -################################### RND N + 1 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*1 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 2 ############################## - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*2 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 3 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*3 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 
# -- - - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - -.endm - -######################################################################## -## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_rorx) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - push %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $-32, %rsp # align rsp to 32 byte boundary - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block - mov NUM_BLKS, _INP_END(%rsp) - - cmp NUM_BLKS, INP - je .Lonly_one_block - - ## load initial digest - mov (CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - -.Lloop0: - ## Load first 16 dwords from two blocks - VMOVDQ 0*32(INP),XTMP0 - VMOVDQ 1*32(INP),XTMP1 - VMOVDQ 2*32(INP),XTMP2 - VMOVDQ 3*32(INP),XTMP3 - - ## byte swap data - vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 - vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 - vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 - vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 - - ## transpose data into high/low halves - vperm2i128 $0x20, XTMP2, XTMP0, X0 - vperm2i128 $0x31, XTMP2, XTMP0, X1 - vperm2i128 $0x20, XTMP3, XTMP1, X2 - vperm2i128 $0x31, XTMP3, XTMP1, X3 - -.Llast_block_enter: - add $64, INP - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 12 each - xor SRND, SRND - -.align 16 -.Lloop1: - leaq K256+0*32(%rip), INP ## reuse INP as scratch reg - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) - - leaq K256+2*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) - - leaq K256+3*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) - - add $4*32, SRND - cmp $3*4*32, SRND - jb .Lloop1 - -.Lloop2: - ## Do last 16 rounds with no scheduling - leaq K256+0*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X1, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 1*32) - add $2*32, SRND - - vmovdqa X2, X0 - vmovdqa X3, X1 - - cmp $4*4*32, SRND - jb .Lloop2 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - ja .Ldone_hash - - #### Do second block using previously scheduled results - xor SRND, SRND -.align 16 -.Lloop3: - DO_4ROUNDS (_XFER + 0*32 + 16) - DO_4ROUNDS (_XFER + 1*32 + 16) - add $2*32, SRND - cmp $4*4*32, SRND - jb .Lloop3 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - add $64, INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm 
(4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - jb .Lloop0 - ja .Ldone_hash - -.Ldo_last_block: - VMOVDQ 0*16(INP),XWORD0 - VMOVDQ 1*16(INP),XWORD1 - VMOVDQ 2*16(INP),XWORD2 - VMOVDQ 3*16(INP),XWORD3 - - vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 - vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 - vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 - vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - jmp .Llast_block_enter - -.Lonly_one_block: - - ## load initial digest - mov (4*0)(CTX),a - mov (4*1)(CTX),b - mov (4*2)(CTX),c - mov (4*3)(CTX),d - mov (4*4)(CTX),e - mov (4*5)(CTX),f - mov (4*6)(CTX),g - mov (4*7)(CTX),h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - jmp .Ldo_last_block - -.Ldone_hash: - - mov %rbp, %rsp - pop %rbp - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - vzeroupper - RET -SYM_FUNC_END(sha256_transform_rorx) - -.section .rodata.cst512.K256, "aM", @progbits, 512 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 - -# shuffle xBxA -> 00BA -.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 -.align 32 -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 - -# shuffle xDxC -> DC00 -.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 -.align 32 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256-ni-asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S deleted file mode 100644 index d3548206cf3d..000000000000 --- a/arch/x86/lib/crypto/sha256-ni-asm.S +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - 
* - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley <sean.m.gulley@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/linkage.h> -#include <linux/objtool.h> - -#define STATE_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 /* sha256rnds2 implicit operand */ -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define TMP %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -.macro do_4rounds i, m0, m1, m2, m3 -.if \i < 16 - movdqu \i*4(DATA_PTR), \m0 - pshufb SHUF_MASK, \m0 -.endif - movdqa (\i-32)*4(SHA256CONSTANTS), MSG - paddd \m0, MSG - sha256rnds2 STATE0, STATE1 -.if \i >= 12 && \i < 60 - movdqa \m0, TMP - palignr $4, \m3, TMP - paddd TMP, \m1 - sha256msg2 \m0, \m1 -.endif - punpckhqdq MSG, MSG - sha256rnds2 STATE1, STATE0 -.if \i >= 4 && \i < 52 - sha256msg1 \m0, \m3 -.endif -.endm - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 block function - * - * This function takes a pointer to the current SHA-256 state, a pointer to the - * input data, and the number of 64-byte blocks to process. 
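For reference, the calling convention described in this header comment is the same one the (also removed) C glue in arch/x86/lib/crypto/sha256.c relied on: the caller passes only whole 64-byte blocks and brackets the call with kernel_fpu_begin()/kernel_fpu_end(), since the routine clobbers XMM state. A minimal caller sketch under those assumptions is shown below; it is not part of the patch, the helper name sha256_do_blocks() is hypothetical, and the headers are the ones the removed glue file used.

    #include <asm/fpu/api.h>           /* kernel_fpu_begin()/kernel_fpu_end() */
    #include <crypto/internal/sha2.h>  /* SHA256_STATE_WORDS, SHA256_BLOCK_SIZE */
    #include <linux/linkage.h>         /* asmlinkage */
    #include <linux/types.h>

    asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
                                        const u8 *data, size_t nblocks);

    /*
     * Hypothetical helper: hand only complete 64-byte blocks to the asm
     * routine; buffering of a partial tail and final padding are assumed
     * to be handled elsewhere, as the comment above requires.
     */
    static void sha256_do_blocks(u32 state[SHA256_STATE_WORDS],
                                 const u8 *data, size_t len)
    {
            size_t nblocks = len / SHA256_BLOCK_SIZE;

            if (!nblocks)
                    return;

            kernel_fpu_begin();        /* the transform uses XMM registers */
            sha256_ni_transform(state, data, nblocks);
            kernel_fpu_end();
    }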
Once all blocks - * have been processed, the state is updated with the new state. This function - * only processes complete blocks. State initialization, buffering of partial - * blocks, and digest finalization is expected to be handled elsewhere. - * - * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], - * const u8 *data, size_t nblocks); - */ -.text -SYM_FUNC_START(sha256_ni_transform) - ANNOTATE_NOENDBR # since this is called only via static_call - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ - movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ - - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* FEBA */ - punpckhqdq TMP, STATE1 /* DCHG */ - pshufd $0x1B, STATE0, STATE0 /* ABEF */ - pshufd $0xB1, STATE1, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256+32*4(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - -.irp i, 0, 16, 32, 48 - do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 - do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 - do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 - do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 -.endr - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* GHEF */ - punpckhqdq TMP, STATE1 /* ABCD */ - pshufd $0xB1, STATE0, STATE0 /* HGFE */ - pshufd $0x1B, STATE1, STATE1 /* DCBA */ - - movdqu STATE1, 0*16(STATE_PTR) - movdqu STATE0, 1*16(STATE_PTR) - -.Ldone_hash: - - RET -SYM_FUNC_END(sha256_ni_transform) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/lib/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S deleted file mode 100644 index 7f24a4cdcb25..000000000000 --- a/arch/x86/lib/crypto/sha256-ssse3-asm.S +++ /dev/null @@ -1,511 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. 
-# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include <linux/linkage.h> -#include <linux/objtool.h> - -## assume buffers not aligned -#define MOVDQ movdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - MOVDQ \p2, \p1 - pshufb \p3, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm12 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - movdqa X3, XTMP0 - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - 
palignr $4, X2, XTMP0 # XTMP0 = W[-7] - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa X1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - palignr $4, X0, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - movdqa XTMP1, XTMP2 # XTMP2 = W[-15] - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH - movdqa XTMP1, XTMP3 # XTMP3 = W[-15] - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pslld $(32-7), XTMP1 # - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - psrld $7, XTMP2 # - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP3, XTMP2 # XTMP2 = W[-15] - mov e, y0 # y0 = e - mov a, y1 # y1 = a - movdqa XTMP3, XTMP4 # XTMP4 = W[-15] - ror $(25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(22-13), y1 # y1 = a >> (22-13) - pslld $(32-18), XTMP3 # - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - psrld $18, XTMP2 # - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pxor XTMP4, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} - mov e, y0 # y0 = e - mov a, y1 # y1 = a - ror $(25-11), y0 # y0 = e >> (25-11) - movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP2 
- add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - movdqa XTMP2, X0 # X0 = W[-2] {DDCC} - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 - xor g, y2 # y2 = CH = ((f^g)&e)^g - pxor XTMP3, XTMP2 # - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, X0 # X0 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_DC00, X0 # X0 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_ssse3) - 
ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $~15, %rsp - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS - mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - movdqa _SHUF_00BA(%rip), SHUF_00BA - movdqa _SHUF_DC00(%rip), SHUF_DC00 - -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - movdqa (TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 1*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 2*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 3*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - paddd (TBL), X0 - movdqa X0, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd 1*16(TBL), X1 - movdqa X1, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X2, X0 - movdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - - RET -SYM_FUNC_END(sha256_transform_ssse3) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256.c 
b/arch/x86/lib/crypto/sha256.c deleted file mode 100644 index 80380f8fdcee..000000000000 --- a/arch/x86/lib/crypto/sha256.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for x86_64 - * - * Copyright 2025 Google LLC - */ -#include <asm/fpu/api.h> -#include <crypto/internal/sha2.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/static_call.h> - -asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); - -DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_sha256_x86)) { - kernel_fpu_begin(); - static_call(sha256_blocks_x86)(state, data, nblocks); - kernel_fpu_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_sha256_x86); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_x86_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - static_call_update(sha256_blocks_x86, sha256_ni_transform); - } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | - XFEATURE_MASK_YMM, NULL) && - boot_cpu_has(X86_FEATURE_AVX)) { - if (boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - static_call_update(sha256_blocks_x86, - sha256_transform_rorx); - else - static_call_update(sha256_blocks_x86, - sha256_transform_avx); - } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { - return 0; - } - static_branch_enable(&have_sha256_x86); - return 0; -} -subsys_initcall(sha256_x86_mod_init); - -static void __exit sha256_x86_mod_exit(void) -{ -} -module_exit(sha256_x86_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 89079ea73e65..a4700ef6eb64 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -266,6 +266,32 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) st->prot_levels[level] = effective; } +static void effective_prot_pte(struct ptdump_state *st, pte_t pte) +{ + effective_prot(st, 4, pte_val(pte)); +} + +static void effective_prot_pmd(struct ptdump_state *st, pmd_t pmd) +{ + effective_prot(st, 3, pmd_val(pmd)); +} + +static void effective_prot_pud(struct ptdump_state *st, pud_t pud) +{ + effective_prot(st, 2, pud_val(pud)); +} + +static void effective_prot_p4d(struct ptdump_state *st, p4d_t p4d) +{ + effective_prot(st, 1, p4d_val(p4d)); +} + +static void effective_prot_pgd(struct ptdump_state *st, pgd_t pgd) +{ + effective_prot(st, 0, pgd_val(pgd)); +} + + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to @@ -362,6 +388,38 @@ static void note_page(struct ptdump_state *pt_st, 
unsigned long addr, int level, } } +static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte) +{ + note_page(pt_st, addr, 4, pte_val(pte)); +} + +static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd) +{ + note_page(pt_st, addr, 3, pmd_val(pmd)); +} + +static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud) +{ + note_page(pt_st, addr, 2, pud_val(pud)); +} + +static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d) +{ + note_page(pt_st, addr, 1, p4d_val(p4d)); +} + +static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd) +{ + note_page(pt_st, addr, 0, pgd_val(pgd)); +} + +static void note_page_flush(struct ptdump_state *pt_st) +{ + pte_t pte_zero = {0}; + + note_page(pt_st, 0, -1, pte_val(pte_zero)); +} + bool ptdump_walk_pgd_level_core(struct seq_file *m, struct mm_struct *mm, pgd_t *pgd, bool checkwx, bool dmesg) @@ -378,8 +436,17 @@ bool ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { - .note_page = note_page, - .effective_prot = effective_prot, + .note_page_pte = note_page_pte, + .note_page_pmd = note_page_pmd, + .note_page_pud = note_page_pud, + .note_page_p4d = note_page_p4d, + .note_page_pgd = note_page_pgd, + .note_page_flush = note_page_flush, + .effective_prot_pte = effective_prot_pte, + .effective_prot_pmd = effective_prot_pmd, + .effective_prot_pud = effective_prot_pud, + .effective_prot_p4d = effective_prot_p4d, + .effective_prot_pgd = effective_prot_pgd, .range = ptdump_ranges }, .level = -1, diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index bf8dab18be97..2fdc1f1f5adb 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -122,13 +122,12 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup, static bool ex_handler_fprestore(const struct exception_table_entry *fixup, struct pt_regs *regs) { - regs->ip = ex_fixup_addr(fixup); - WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); fpu_reset_from_exception_fixup(); - return true; + + return ex_handler_default(fixup, regs); } /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 607d6a2e66e2..8a34fff6ab2b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,7 +30,6 @@ #include <linux/initrd.h> #include <linux/cpumask.h> #include <linux/gfp.h> -#include <linux/execmem.h> #include <asm/asm.h> #include <asm/bios_ebda.h> @@ -749,8 +748,6 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 66330fe4e18c..76e33bd7c556 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,7 +34,6 @@ #include <linux/gfp.h> #include <linux/kcore.h> #include <linux/bootmem_info.h> -#include <linux/execmem.h> #include <asm/processor.h> #include <asm/bios_ebda.h> @@ -806,7 +805,7 @@ kernel_physical_mapping_change(unsigned long paddr_start, } #ifndef CONFIG_NUMA -static inline void x86_numa_init(void) +static __always_inline void x86_numa_init(void) { memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); } @@ -1392,8 +1391,6 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; /* @@ -1467,16 +1464,21 @@ static unsigned long 
probe_memory_block_size(void) } /* - * Use max block size to minimize overhead on bare metal, where - * alignment for memory hotplug isn't a concern. + * When hotplug alignment is not a concern, maximize blocksize + * to minimize overhead. Otherwise, align to the lesser of advice + * alignment and end of memory alignment. */ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + bz = memory_block_advised_max_size(); + if (!bz) { bz = MAX_BLOCK_SIZE; - goto done; + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + goto done; + } else { + bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE); } /* Find the largest allowed block size that aligns to memory end */ - for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) break; } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 331e101bf801..12c8180ca1ba 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -71,7 +71,7 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, static unsigned int __ioremap_check_ram(struct resource *res) { unsigned long start_pfn, stop_pfn; - unsigned long i; + unsigned long pfn; if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) return 0; @@ -79,9 +79,8 @@ static unsigned int __ioremap_check_ram(struct resource *res) start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; stop_pfn = (res->end + 1) >> PAGE_SHIFT; if (stop_pfn > start_pfn) { - for (i = 0; i < (stop_pfn - start_pfn); ++i) - if (pfn_valid(start_pfn + i) && - !PageReserved(pfn_to_page(start_pfn + i))) + for_each_valid_pfn(pfn, start_pfn, stop_pfn) + if (!PageReserved(pfn_to_page(pfn))) return IORES_MAP_SYSTEM_RAM; } diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index c97b527c66fe..2e7923844afe 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -775,6 +775,12 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return vma_prot; } +static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm) +{ + *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) | + cachemode2protval(pcm)); +} + int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { @@ -789,8 +795,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | - cachemode2protval(pcm)); + pgprot_set_cachemode(vma_prot, pcm); return 1; } @@ -831,8 +836,7 @@ int memtype_kernel_map_sync(u64 base, unsigned long size, * Reserved non RAM regions only and after successful memtype_reserve, * this func also keeps identity mapping (if any) in sync with this new prot. 
*/ -static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, - int strict_prot) +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot) { int is_ram = 0; int ret; @@ -858,9 +862,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), cattr_name(pcm)); - *vma_prot = __pgprot((pgprot_val(*vma_prot) & - (~_PAGE_CACHE_MASK)) | - cachemode2protval(pcm)); + pgprot_set_cachemode(vma_prot, pcm); } return 0; } @@ -870,8 +872,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, return ret; if (pcm != want_pcm) { - if (strict_prot || - !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { + if (!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { memtype_free(paddr, paddr + size); pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, @@ -881,13 +882,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, cattr_name(pcm)); return -EINVAL; } - /* - * We allow returning different type than the one requested in - * non strict case. - */ - *vma_prot = __pgprot((pgprot_val(*vma_prot) & - (~_PAGE_CACHE_MASK)) | - cachemode2protval(pcm)); + pgprot_set_cachemode(vma_prot, pcm); } if (memtype_kernel_map_sync(paddr, size, pcm) < 0) { @@ -910,124 +905,14 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } -static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, - resource_size_t *phys) -{ - struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start }; - - if (follow_pfnmap_start(&args)) - return -EINVAL; - - /* Never return PFNs of anon folios in COW mappings. */ - if (!args.special) { - follow_pfnmap_end(&args); - return -EINVAL; - } - - *prot = pgprot_val(args.pgprot); - *phys = (resource_size_t)args.pfn << PAGE_SHIFT; - follow_pfnmap_end(&args); - return 0; -} - -static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, - pgprot_t *pgprot) -{ - unsigned long prot; - - VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT)); - - /* - * We need the starting PFN and cachemode used for track_pfn_remap() - * that covered the whole VMA. For most mappings, we can obtain that - * information from the page tables. For COW mappings, we might now - * suddenly have anon folios mapped and follow_phys() will fail. - * - * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to - * detect the PFN. If we need the cachemode as well, we're out of luck - * for now and have to fail fork(). - */ - if (!follow_phys(vma, &prot, paddr)) { - if (pgprot) - *pgprot = __pgprot(prot); - return 0; - } - if (is_cow_mapping(vma->vm_flags)) { - if (pgprot) - return -EINVAL; - *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return 0; - } - WARN_ON_ONCE(1); - return -EINVAL; -} - -int track_pfn_copy(struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, unsigned long *pfn) -{ - const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start; - resource_size_t paddr; - pgprot_t pgprot; - int rc; - - if (!(src_vma->vm_flags & VM_PAT)) - return 0; - - /* - * Duplicate the PAT information for the dst VMA based on the src - * VMA. - */ - if (get_pat_info(src_vma, &paddr, &pgprot)) - return -EINVAL; - rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1); - if (rc) - return rc; - - /* Reservation for the destination VMA succeeded. 
*/ - vm_flags_set(dst_vma, VM_PAT); - *pfn = PHYS_PFN(paddr); - return 0; -} - -void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn) -{ - untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true); - /* - * Reservation was freed, any copied page tables will get cleaned - * up later, but without getting PAT involved again. - */ -} - -/* - * prot is passed in as a parameter for the new mapping. If the vma has - * a linear pfn mapping for the entire range, or no vma is provided, - * reserve the entire pfn + size range with single reserve_pfn_range - * call. - */ -int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, unsigned long size) +int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; enum page_cache_mode pcm; - /* reserve the whole chunk starting from paddr */ - if (!vma || (addr == vma->vm_start - && size == (vma->vm_end - vma->vm_start))) { - int ret; - - ret = reserve_pfn_range(paddr, size, prot, 0); - if (ret == 0 && vma) - vm_flags_set(vma, VM_PAT); - return ret; - } - if (!pat_enabled()) return 0; - /* - * For anything smaller than the vma size we set prot based on the - * lookup. - */ pcm = lookup_memtype(paddr); /* Check memtype for the remaining pages */ @@ -1038,70 +923,35 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | - cachemode2protval(pcm)); - + pgprot_set_cachemode(prot, pcm); return 0; } -void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) +int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot) { - enum page_cache_mode pcm; + const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; - if (!pat_enabled()) - return; - - /* Set prot based on lookup */ - pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | - cachemode2protval(pcm)); + return reserve_pfn_range(paddr, size, prot); } -/* - * untrack_pfn is called while unmapping a pfnmap for a region. - * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case pfn, size are zero). 
- */ -void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size, bool mm_wr_locked) +void pfnmap_untrack(unsigned long pfn, unsigned long size) { - resource_size_t paddr; - - if (vma && !(vma->vm_flags & VM_PAT)) - return; + const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; - /* free the chunk starting from pfn or the whole chunk */ - paddr = (resource_size_t)pfn << PAGE_SHIFT; - if (!paddr && !size) { - if (get_pat_info(vma, &paddr, NULL)) - return; - size = vma->vm_end - vma->vm_start; - } free_pfn_range(paddr, size); - if (vma) { - if (mm_wr_locked) - vm_flags_clear(vma, VM_PAT); - else - __vm_flags_mod(vma, 0, VM_PAT); - } -} - -void untrack_pfn_clear(struct vm_area_struct *vma) -{ - vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) { - return __pgprot(pgprot_val(prot) | - cachemode2protval(_PAGE_CACHE_MODE_WC)); + pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WC); + return prot; } EXPORT_SYMBOL_GPL(pgprot_writecombine); pgprot_t pgprot_writethrough(pgprot_t prot) { - return __pgprot(pgprot_val(prot) | - cachemode2protval(_PAGE_CACHE_MODE_WT)); + pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WT); + return prot; } EXPORT_SYMBOL_GPL(pgprot_writethrough); diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c index 645613d59942..e5844ed1311e 100644 --- a/arch/x86/mm/pat/memtype_interval.c +++ b/arch/x86/mm/pat/memtype_interval.c @@ -49,32 +49,6 @@ INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end, static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED; -enum { - MEMTYPE_EXACT_MATCH = 0, - MEMTYPE_END_MATCH = 1 -}; - -static struct memtype *memtype_match(u64 start, u64 end, int match_type) -{ - struct memtype *entry_match; - - entry_match = interval_iter_first(&memtype_rbroot, start, end-1); - - while (entry_match != NULL && entry_match->start < end) { - if ((match_type == MEMTYPE_EXACT_MATCH) && - (entry_match->start == start) && (entry_match->end == end)) - return entry_match; - - if ((match_type == MEMTYPE_END_MATCH) && - (entry_match->start < start) && (entry_match->end == end)) - return entry_match; - - entry_match = interval_iter_next(entry_match, start, end-1); - } - - return NULL; /* Returns NULL if there is no match */ -} - static int memtype_check_conflict(u64 start, u64 end, enum page_cache_mode reqtype, enum page_cache_mode *newtype) @@ -130,35 +104,16 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty struct memtype *memtype_erase(u64 start, u64 end) { - struct memtype *entry_old; - - /* - * Since the memtype_rbroot tree allows overlapping ranges, - * memtype_erase() checks with EXACT_MATCH first, i.e. free - * a whole node for the munmap case. If no such entry is found, - * it then checks with END_MATCH, i.e. shrink the size of a node - * from the end for the mremap case. 
- */ - entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH); - if (!entry_old) { - entry_old = memtype_match(start, end, MEMTYPE_END_MATCH); - if (!entry_old) - return ERR_PTR(-EINVAL); + struct memtype *entry = interval_iter_first(&memtype_rbroot, start, end - 1); + + while (entry && entry->start < end) { + if (entry->start == start && entry->end == end) { + interval_remove(entry, &memtype_rbroot); + return entry; + } + entry = interval_iter_next(entry, start, end - 1); } - - if (entry_old->start == start) { - /* munmap: erase this node */ - interval_remove(entry_old, &memtype_rbroot); - } else { - /* mremap: update the end value of this node */ - interval_remove(entry_old, &memtype_rbroot); - entry_old->end = start; - interval_insert(entry_old, &memtype_rbroot); - - return NULL; - } - - return entry_old; + return ERR_PTR(-EINVAL); } struct memtype *memtype_lookup(u64 addr) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 30ab4aced761..8834c76f91c9 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, pgprot_t pgprot; int i = 0; + if (!cpu_feature_enabled(X86_FEATURE_PSE)) + return 0; + addr &= PMD_MASK; pte = pte_offset_kernel(pmd, addr); first = *pte; @@ -2148,6 +2151,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, CPA_PAGES_ARRAY, pages); } +/* + * __set_memory_prot is an internal helper for callers that have been passed + * a pgprot_t value from upper layers and a reservation has already been taken. + * If you want to set the pgprot to a specific page protocol, use the + * set_memory_xx() functions. + */ +int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) +{ + return change_page_attr_set_clr(&addr, numpages, prot, + __pgprot(~pgprot_val(prot)), 0, 0, + NULL); +} + int _set_memory_uc(unsigned long addr, int numpages) { /* diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 62777ba4de1a..ddf248c3ee7d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -189,7 +189,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (!ptdesc) failed = true; - if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { + if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; @@ -751,14 +751,13 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); - free_page((unsigned long)pte); + pte_free_kernel(&init_mm, pte); } } free_page((unsigned long)pmd_sv); - pagetable_dtor(virt_to_ptdesc(pmd)); - free_page((unsigned long)pmd); + pmd_free(&init_mm, pmd); return 1; } @@ -781,7 +780,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); - free_page((unsigned long)pte); + pte_free_kernel(&init_mm, pte); return 1; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 190299834011..b10d4d131dce 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -38,6 +38,7 @@ #include <asm/desc.h> #include <asm/sections.h> #include <asm/set_memory.h> +#include <asm/bugs.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt @@ -84,7 +85,8 @@ void __init pti_check_boottime_disable(void) return; } - if (cpu_mitigations_off()) + if (pti_mode == PTI_AUTO && + 
!cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) pti_mode = PTI_FORCE_OFF; if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); @@ -98,6 +100,11 @@ void __init pti_check_boottime_disable(void) return; setup_force_cpu_cap(X86_FEATURE_PTI); + + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + pr_debug("PTI enabled, disabling INVLPGB\n"); + setup_clear_cpu_cap(X86_FEATURE_INVLPGB); + } } static int __init pti_parse_cmdline(char *arg) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 15672cb926fc..7e3fca164620 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3501,13 +3501,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } -static const char *bpf_get_prog_name(struct bpf_prog *prog) -{ - if (prog->aux->ksym.prog) - return prog->aux->ksym.name; - return prog->aux->name; -} - static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size) { int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3; @@ -3531,7 +3524,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size if (stack_ptr[0] != PRIV_STACK_GUARD_VAL || stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) { pr_err("BPF private stack overflow/underflow detected for prog %sx\n", - bpf_get_prog_name(prog)); + bpf_jit_get_prog_name(prog)); break; } } @@ -3845,7 +3838,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp } return; #endif - WARN(1, "verification of programs using bpf_throw should have failed\n"); } void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 4933fb337983..c1efd5b0d198 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -8,13 +8,13 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o -obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o -obj-$(CONFIG_X86_NUMACHIP) += numachip.o +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o +obj-$(CONFIG_X86_INTEL_MID) += intel_mid.o -obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o +obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-y += common.o early.o obj-y += bus_numa.o diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 36336299596b..e7e71490bd25 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -970,13 +970,13 @@ static void amd_rp_pme_suspend(struct pci_dev *dev) struct pci_dev *rp; /* - * PM_SUSPEND_ON means we're doing runtime suspend, which means + * If system suspend is not in progress, we're doing runtime suspend, so * amd-pmc will not be involved so PMEs during D3 work as advertised. * * The PMEs *do* work if amd-pmc doesn't put the SoC in the hardware * sleep state, but we assume amd-pmc is always present. 
*/ - if (pm_suspend_target_state == PM_SUSPEND_ON) + if (!pm_suspend_in_progress()) return; rp = pcie_find_root_port(dev); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid.c index b433b1753016..b433b1753016 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid.c diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index f8126821a94d..aaa7017416f7 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -5,19 +5,12 @@ * (C) Copyright 2010 Intel Corporation */ #include <linux/init.h> -#include <linux/kernel.h> -#include <linux/irq.h> #include <linux/reboot.h> -#include <linux/serial_reg.h> -#include <linux/serial_8250.h> #include <asm/ce4100.h> #include <asm/prom.h> #include <asm/setup.h> -#include <asm/i8259.h> #include <asm/io.h> -#include <asm/io_apic.h> -#include <asm/emergency-restart.h> /* * The CE4100 platform has an internal 8051 Microcontroller which is @@ -31,94 +24,6 @@ static void ce4100_power_off(void) outb(0x4, 0xcf9); } -#ifdef CONFIG_SERIAL_8250 - -static unsigned int mem_serial_in(struct uart_port *p, int offset) -{ - offset = offset << p->regshift; - return readl(p->membase + offset); -} - -/* - * The UART Tx interrupts are not set under some conditions and therefore serial - * transmission hangs. This is a silicon issue and has not been root caused. The - * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT - * bit of LSR register in interrupt handler to see whether at least one of these - * two bits is set, if so then process the transmit request. If this workaround - * is not applied, then the serial transmission may hang. This workaround is for - * errata number 9 in Errata - B step. -*/ - -static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset) -{ - unsigned int ret, ier, lsr; - - if (offset == UART_IIR) { - offset = offset << p->regshift; - ret = readl(p->membase + offset); - if (ret & UART_IIR_NO_INT) { - /* see if the TX interrupt should have really set */ - ier = mem_serial_in(p, UART_IER); - /* see if the UART's XMIT interrupt is enabled */ - if (ier & UART_IER_THRI) { - lsr = mem_serial_in(p, UART_LSR); - /* now check to see if the UART should be - generating an interrupt (but isn't) */ - if (lsr & (UART_LSR_THRE | UART_LSR_TEMT)) - ret &= ~UART_IIR_NO_INT; - } - } - } else - ret = mem_serial_in(p, offset); - return ret; -} - -static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) -{ - offset = offset << p->regshift; - writel(value, p->membase + offset); -} - -static void ce4100_serial_fixup(int port, struct uart_port *up, - u32 *capabilities) -{ -#ifdef CONFIG_EARLY_PRINTK - /* - * Over ride the legacy port configuration that comes from - * asm/serial.h. 
Using the ioport driver then switching to the - * PCI memmaped driver hangs the IOAPIC - */ - if (up->iotype != UPIO_MEM32) { - up->uartclk = 14745600; - up->mapbase = 0xdffe0200; - set_fixmap_nocache(FIX_EARLYCON_MEM_BASE, - up->mapbase & PAGE_MASK); - up->membase = - (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); - up->membase += up->mapbase & ~PAGE_MASK; - up->mapbase += port * 0x100; - up->membase += port * 0x100; - up->iotype = UPIO_MEM32; - up->regshift = 2; - up->irq = 4; - } -#endif - up->iobase = 0; - up->serial_in = ce4100_mem_serial_in; - up->serial_out = ce4100_mem_serial_out; - - *capabilities |= (1 << 12); -} - -static __init void sdv_serial_fixup(void) -{ - serial8250_set_isa_configurator(ce4100_serial_fixup); -} - -#else -static inline void sdv_serial_fixup(void) {}; -#endif - static void __init sdv_arch_setup(void) { sdv_serial_fixup(); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index e7e8f77f77f8..b4409df2105a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -216,8 +216,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * When SEV-ES is active, the GHCB as set by the kernel will be used * by firmware. Create a 1:1 unencrypted mapping for each GHCB. */ - if (sev_es_efi_map_ghcbs(pgd)) { - pr_err("Failed to create 1:1 mapping for the GHCBs!\n"); + if (sev_es_efi_map_ghcbs_cas(pgd)) { + pr_err("Failed to create 1:1 mapping for the GHCBs and CAs!\n"); return 1; } diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index a7c23f2a58c9..a2294c1649f6 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -192,7 +192,8 @@ out: int arch_resume_nosmt(void) { - int ret = 0; + int ret; + /* * We reached this while coming out of hibernation. This means * that SMT siblings are sleeping in hlt, as mwait is not safe @@ -206,18 +207,10 @@ int arch_resume_nosmt(void) * Called with hotplug disabled. */ cpu_hotplug_enable(); - if (cpu_smt_control == CPU_SMT_DISABLED || - cpu_smt_control == CPU_SMT_FORCE_DISABLED) { - enum cpuhp_smt_control old = cpu_smt_control; - - ret = cpuhp_smt_enable(); - if (ret) - goto out; - ret = cpuhp_smt_disable(old); - if (ret) - goto out; - } -out: + + ret = arch_cpu_rescan_dead_smt_siblings(); + cpu_hotplug_disable(); + return ret; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index ebdfd7b84feb..e0a607a14e7e 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -35,7 +35,7 @@ targets += purgatory.ro purgatory.chk PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0 PURGATORY_CFLAGS += -fpic -fvisibility=hidden -PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING +PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING PURGATORY_CFLAGS += -fno-stack-protector # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. 
That diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index aea47e793963..655139dd0532 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -25,7 +25,7 @@ static int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; - struct sha256_state sctx; + struct sha256_ctx sctx; sha256_init(&sctx); end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index ed5c63c0b4e5..88be32026768 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -66,6 +66,8 @@ void __init reserve_real_mode(void) * setup_arch(). */ memblock_reserve(0, SZ_1M); + + memblock_clear_kho_scratch(0, SZ_1M); } static void __init sme_sev_setup_real_mode(struct trampoline_header *th) diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index 08cd913cbd4e..8bf15c4aefa9 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) pr_warn("Decoded and checked %d instructions with %d " "failures\n", insns, warnings); else - fprintf(stdout, "%s: success: Decoded and checked %d" + fprintf(stdout, " %s: success: Decoded and checked %d" " instructions\n", prog, insns); return 0; } diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 213f35f94feb..e743f0ea01ee 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -253,9 +253,9 @@ int main(int argc, char **argv) } fprintf((errors) ? stderr : stdout, - "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + " %s: %s: Decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", prog, - (errors) ? "Failure" : "Success", + (errors) ? "failure" : "success", insns, (input_file) ? "given" : "random", errors, diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index b07824500363..ddc144657efa 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -20,6 +20,9 @@ */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); +/* Do not call this directly. Declared for export type visibility. */ +extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); + /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 478710384b34..e222d2ae28fd 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -21,10 +21,10 @@ #include <asm/user.h> -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) +/* PAUSE is a good thing to insert into busy-wait loops. 
*/ +static __always_inline void native_pause(void) { - __asm__ __volatile__("rep;nop": : :"memory"); + __asm__ __volatile__("pause": : :"memory"); } static __always_inline void cpu_relax(void) @@ -33,7 +33,7 @@ static __always_inline void cpu_relax(void) time_travel_mode == TT_MODE_EXTERNAL) time_travel_ndelay(1); else - rep_nop(); + native_pause(); } #define task_pt_regs(t) (&(t)->thread.regs) diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 56a2f0913e3c..d6208d0fad51 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -9,6 +9,8 @@ typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern const sys_call_ptr_t sys_call_table[]; + static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index 37decaa74761..a21403df6663 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -#include <sys/ucontext.h> #define __FRAME_OFFSETS +#include <linux/errno.h> +#include <linux/string.h> +#include <sys/ucontext.h> #include <asm/ptrace.h> +#include <asm/sigcontext.h> #include <sysdep/ptrace.h> #include <sysdep/mcontext.h> #include <arch.h> @@ -18,6 +21,10 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY2(UESP, ESP); /* sic */ COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#undef COPY2 +#undef COPY +#undef COPY_SEG +#undef COPY_SEG_CPL3 #else #define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y] #define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X] @@ -29,6 +36,8 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY2(EFLAGS, EFL); COPY2(CS, CSGSFS); regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48; +#undef COPY2 +#undef COPY #endif } @@ -42,3 +51,210 @@ void mc_set_rip(void *_mc, void *target) mc->gregs[REG_RIP] = (unsigned long)target; #endif } + +/* Same thing, but the copy macros are turned around. 
*/ +void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, int single_stepping) +{ +#ifdef __i386__ +#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X] +#define COPY(X) mc->gregs[REG_##X] = regs->gp[X] +#define COPY_SEG(X) mc->gregs[REG_##X] = regs->gp[X] & 0xffff; +#define COPY_SEG_CPL3(X) mc->gregs[REG_##X] = (regs->gp[X] & 0xffff) | 3; + COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS); + COPY(EDI); COPY(ESI); COPY(EBP); + COPY2(UESP, ESP); /* sic */ + COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); + COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#else +#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X/sizeof(unsigned long)] +#define COPY(X) mc->gregs[REG_##X] = regs->gp[X/sizeof(unsigned long)] + COPY(R8); COPY(R9); COPY(R10); COPY(R11); + COPY(R12); COPY(R13); COPY(R14); COPY(R15); + COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX); + COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP); + COPY(RIP); + COPY2(EFLAGS, EFL); + mc->gregs[REG_CSGSFS] = mc->gregs[REG_CSGSFS] & 0xffffffffffffl; + mc->gregs[REG_CSGSFS] |= (regs->gp[SS / sizeof(unsigned long)] & 0xffff) << 48; +#endif + + if (single_stepping) + mc->gregs[REG_EFL] |= X86_EFLAGS_TF; + else + mc->gregs[REG_EFL] &= ~X86_EFLAGS_TF; +} + +#ifdef CONFIG_X86_32 +struct _xstate_64 { + struct _fpstate_64 fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ +}; + +/* Not quite the right structures as these contain more information */ +int um_i387_from_fxsr(struct _fpstate_32 *i387, + const struct _fpstate_64 *fxsave); +int um_fxsr_from_i387(struct _fpstate_64 *fxsave, + const struct _fpstate_32 *from); +#else +#define _xstate_64 _xstate +#endif + +static struct _fpstate *get_fpstate(struct stub_data *data, + mcontext_t *mcontext, + int *fp_size) +{ + struct _fpstate *res; + + /* Assume floating point registers are on the same page */ + res = (void *)(((unsigned long)mcontext->fpregs & + (UM_KERN_PAGE_SIZE - 1)) + + (unsigned long)&data->sigstack[0]); + + if ((void *)res + sizeof(struct _fpstate) > + (void *)data->sigstack + sizeof(data->sigstack)) + return NULL; + + if (res->sw_reserved.magic1 != FP_XSTATE_MAGIC1) { + *fp_size = sizeof(struct _fpstate); + } else { + char *magic2_addr; + + magic2_addr = (void *)res; + magic2_addr += res->sw_reserved.extended_size; + magic2_addr -= FP_XSTATE_MAGIC2_SIZE; + + /* We still need to be within our stack */ + if ((void *)magic2_addr > + (void *)data->sigstack + sizeof(data->sigstack)) + return NULL; + + /* If we do not read MAGIC2, then we did something wrong */ + if (*(__u32 *)magic2_addr != FP_XSTATE_MAGIC2) + return NULL; + + /* Remove MAGIC2 from the size, we do not save/restore it */ + *fp_size = res->sw_reserved.extended_size - + FP_XSTATE_MAGIC2_SIZE; + } + + return res; +} + +int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + unsigned long *fp_size_out) +{ + mcontext_t *mcontext; + struct _fpstate *fpstate_stub; + struct _xstate_64 *xstate_stub; + int fp_size, xstate_size; + + /* mctx_offset is verified by wait_stub_done_seccomp */ + mcontext = (void *)&data->sigstack[data->mctx_offset]; + + get_regs_from_mc(regs, mcontext); + + fpstate_stub = get_fpstate(data, mcontext, &fp_size); + if (!fpstate_stub) + return -EINVAL; + +#ifdef CONFIG_X86_32 + xstate_stub = (void *)&fpstate_stub->_fxsr_env; + xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env); +#else + xstate_stub = (void *)fpstate_stub; + xstate_size = fp_size; +#endif + + if (fp_size_out) + *fp_size_out = xstate_size; + + if 
(xstate_size > host_fp_size) + return -ENOSPC; + + memcpy(®s->fp, xstate_stub, xstate_size); + + /* We do not need to read the x86_64 FS_BASE/GS_BASE registers as + * we do not permit userspace to set them directly. + */ + +#ifdef CONFIG_X86_32 + /* Read the i387 legacy FP registers */ + if (um_fxsr_from_i387((void *)®s->fp, fpstate_stub)) + return -EINVAL; +#endif + + return 0; +} + +/* Copied because we cannot include regset.h here. */ +struct task_struct; +struct user_regset; +struct membuf { + void *p; + size_t left; +}; + +int fpregs_legacy_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to); + +int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + int single_stepping) +{ + mcontext_t *mcontext; + struct _fpstate *fpstate_stub; + struct _xstate_64 *xstate_stub; + int fp_size, xstate_size; + + /* mctx_offset is verified by wait_stub_done_seccomp */ + mcontext = (void *)&data->sigstack[data->mctx_offset]; + + if ((unsigned long)mcontext < (unsigned long)data->sigstack || + (unsigned long)mcontext > + (unsigned long) data->sigstack + + sizeof(data->sigstack) - sizeof(*mcontext)) + return -EINVAL; + + get_mc_from_regs(regs, mcontext, single_stepping); + + fpstate_stub = get_fpstate(data, mcontext, &fp_size); + if (!fpstate_stub) + return -EINVAL; + +#ifdef CONFIG_X86_32 + xstate_stub = (void *)&fpstate_stub->_fxsr_env; + xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env); +#else + xstate_stub = (void *)fpstate_stub; + xstate_size = fp_size; +#endif + + memcpy(xstate_stub, ®s->fp, xstate_size); + +#ifdef __i386__ + /* + * On x86, the GDT entries are updated by arch_set_tls. + */ + + /* Store the i387 legacy FP registers which the host will use */ + if (um_i387_from_fxsr(fpstate_stub, (void *)®s->fp)) + return -EINVAL; +#else + /* + * On x86_64, we need to sync the FS_BASE/GS_BASE registers using the + * arch specific data. + */ + if (data->arch_data.fs_base != regs->gp[FS_BASE / sizeof(unsigned long)]) { + data->arch_data.fs_base = regs->gp[FS_BASE / sizeof(unsigned long)]; + data->arch_data.sync |= STUB_SYNC_FS_BASE; + } + if (data->arch_data.gs_base != regs->gp[GS_BASE / sizeof(unsigned long)]) { + data->arch_data.gs_base = regs->gp[GS_BASE / sizeof(unsigned long)]; + data->arch_data.sync |= STUB_SYNC_GS_BASE; + } +#endif + + return 0; +} diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 57c504fd5626..2635ca2595a3 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -25,7 +25,8 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) return tmp; } -static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +static inline unsigned long +twd_fxsr_to_i387(const struct user_fxsr_struct *fxsave) { struct _fpxreg *st = NULL; unsigned long twd = (unsigned long) fxsave->twd; @@ -69,12 +70,16 @@ static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) return ret; } -/* Get/set the old 32bit i387 registers (pre-FPX) */ -static int fpregs_legacy_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) +/* + * Get/set the old 32bit i387 registers (pre-FPX) + * + * We provide simple wrappers for mcontext.c, they are only defined locally + * because mcontext.c is userspace facing and needs to a different definition + * of the structures. 
+ */ +static int _um_i387_from_fxsr(struct membuf to, + const struct user_fxsr_struct *fxsave) { - struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; int i; membuf_store(&to, (unsigned long)fxsave->cwd | 0xffff0000ul); @@ -91,23 +96,36 @@ static int fpregs_legacy_get(struct task_struct *target, return 0; } -static int fpregs_legacy_set(struct task_struct *target, +int um_i387_from_fxsr(struct user_i387_struct *i387, + const struct user_fxsr_struct *fxsave); + +int um_i387_from_fxsr(struct user_i387_struct *i387, + const struct user_fxsr_struct *fxsave) +{ + struct membuf to = { + .p = i387, + .left = sizeof(*i387), + }; + + return _um_i387_from_fxsr(to, fxsave); +} + +static int fpregs_legacy_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) { struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; - const struct user_i387_struct *from; - struct user_i387_struct buf; - int i; - if (ubuf) { - if (copy_from_user(&buf, ubuf, sizeof(buf))) - return -EFAULT; - from = &buf; - } else { - from = kbuf; - } + return _um_i387_from_fxsr(to, fxsave); +} + +int um_fxsr_from_i387(struct user_fxsr_struct *fxsave, + const struct user_i387_struct *from); + +int um_fxsr_from_i387(struct user_fxsr_struct *fxsave, + const struct user_i387_struct *from) +{ + int i; fxsave->cwd = (unsigned short)(from->cwd & 0xffff); fxsave->swd = (unsigned short)(from->swd & 0xffff); @@ -125,6 +143,26 @@ static int fpregs_legacy_set(struct task_struct *target, return 0; } + +static int fpregs_legacy_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; + const struct user_i387_struct *from; + struct user_i387_struct buf; + + if (ubuf) { + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + from = &buf; + } else { + from = kbuf; + } + + return um_fxsr_from_i387(fxsave, from); +} #endif static int genregs_get(struct task_struct *target, @@ -198,7 +236,7 @@ static int generic_fpregs_set(struct task_struct *target, static struct user_regset uml_regsets[] __ro_after_init = { [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -208,7 +246,7 @@ static struct user_regset uml_regsets[] __ro_after_init = { #ifdef CONFIG_X86_32 /* Old FP registers, they are needed in signal frames */ [REGSET_FP_LEGACY] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_i387_ia32_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -219,10 +257,10 @@ static struct user_regset uml_regsets[] __ro_after_init = { #endif [REGSET_FP] = { #ifdef CONFIG_X86_32 - .core_note_type = NT_PRXFPREG, + USER_REGSET_NOTE_TYPE(PRXFPREG), .n = sizeof(struct user32_fxsr_struct) / sizeof(long), #else - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_i387_struct) / sizeof(long), #endif .size = sizeof(long), @@ -232,7 +270,7 @@ static struct user_regset uml_regsets[] __ro_after_init = { .set = generic_fpregs_set, }, [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, + USER_REGSET_NOTE_TYPE(X86_XSTATE), .size = sizeof(long), .align = sizeof(long), .active = generic_fpregs_active, diff --git 
a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 48de3a71f845..6fd1ed400399 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -4,7 +4,9 @@ #include <linux/elf.h> #include <linux/crypto.h> #include <linux/kbuild.h> +#include <linux/audit.h> #include <asm/mman.h> +#include <asm/seccomp.h> /* workaround for a warning with -Wmissing-prototypes */ void foo(void); diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h index b724c54da316..6fe490cc5b98 100644 --- a/arch/x86/um/shared/sysdep/mcontext.h +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -6,7 +6,16 @@ #ifndef __SYS_SIGCONTEXT_X86_H #define __SYS_SIGCONTEXT_X86_H +#include <stub-data.h> + extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *); +extern void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, + int single_stepping); + +extern int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + unsigned long *fp_size_out); +extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + int single_stepping); #ifdef __i386__ diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 8f7476ff6e95..572ea2d79131 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -44,18 +44,6 @@ #include "ptrace_64.h" #endif -struct syscall_args { - unsigned long args[6]; -}; - -#define SYSCALL_ARGS(r) ((struct syscall_args) \ - { .args = { UPT_SYSCALL_ARG1(r), \ - UPT_SYSCALL_ARG2(r), \ - UPT_SYSCALL_ARG3(r), \ - UPT_SYSCALL_ARG4(r), \ - UPT_SYSCALL_ARG5(r), \ - UPT_SYSCALL_ARG6(r) } } ) - extern unsigned long host_fp_size; struct uml_pt_regs { diff --git a/arch/x86/um/shared/sysdep/stub-data.h b/arch/x86/um/shared/sysdep/stub-data.h new file mode 100644 index 000000000000..82b1b7f8ac3d --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub-data.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_STUB_DATA_H +#define __ARCH_STUB_DATA_H + +#ifdef __i386__ +#include <generated/asm-offsets.h> +#include <asm/ldt.h> + +struct stub_data_arch { + int sync; + struct user_desc tls[UM_KERN_GDT_ENTRY_TLS_ENTRIES]; +}; +#else +#define STUB_SYNC_FS_BASE (1 << 0) +#define STUB_SYNC_GS_BASE (1 << 1) +struct stub_data_arch { + int sync; + unsigned long fs_base; + unsigned long gs_base; +}; +#endif + +#endif /* __ARCH_STUB_DATA_H */ diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h index dc89f4423454..4fa58f5b4fca 100644 --- a/arch/x86/um/shared/sysdep/stub.h +++ b/arch/x86/um/shared/sysdep/stub.h @@ -13,3 +13,5 @@ extern void stub_segv_handler(int, siginfo_t *, void *); extern void stub_syscall_handler(void); +extern void stub_signal_interrupt(int, siginfo_t *, void *); +extern void stub_signal_restorer(void); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 390988132c0a..df568fc3ceb4 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -131,4 +131,17 @@ static __always_inline void *get_stub_data(void) "call *%%eax ;" \ :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ "i" (&fn)) + +static __always_inline void +stub_seccomp_restore_state(struct stub_data_arch *arch) +{ + for (int i = 0; i < sizeof(arch->tls) / sizeof(arch->tls[0]); i++) { + if (arch->sync & (1 << i)) + stub_syscall1(__NR_set_thread_area, + (unsigned long) &arch->tls[i]); + } + + arch->sync = 0; +} + #endif diff --git 
a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 294affbec742..9cfd31afa769 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -10,6 +10,7 @@ #include <sysdep/ptrace_user.h> #include <generated/asm-offsets.h> #include <linux/stddef.h> +#include <asm/prctl.h> #define STUB_MMAP_NR __NR_mmap #define MMAP_OFFSET(o) (o) @@ -134,4 +135,20 @@ static __always_inline void *get_stub_data(void) "call *%%rax ;" \ :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ "i" (&fn)) + +static __always_inline void +stub_seccomp_restore_state(struct stub_data_arch *arch) +{ + /* + * We could use _writefsbase_u64/_writegsbase_u64 if the host reports + * support in the hwcaps (HWCAP2_FSGSBASE). + */ + if (arch->sync & STUB_SYNC_FS_BASE) + stub_syscall2(__NR_arch_prctl, ARCH_SET_FS, arch->fs_base); + if (arch->sync & STUB_SYNC_GS_BASE) + stub_syscall2(__NR_arch_prctl, ARCH_SET_GS, arch->gs_base); + + arch->sync = 0; +} + #endif diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h deleted file mode 100644 index b2060ac707f0..000000000000 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef __i386__ -#include "syscalls_32.h" -#else -#include "syscalls_64.h" -#endif diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h deleted file mode 100644 index f6e9f84397e7..000000000000 --- a/arch/x86/um/shared/sysdep/syscalls_32.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <asm/unistd.h> -#include <sysdep/ptrace.h> - -typedef long syscall_handler_t(struct syscall_args); - -extern syscall_handler_t *sys_call_table[]; - -#define EXECUTE_SYSCALL(syscall, regs) \ - ((*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h deleted file mode 100644 index b6b997225841..000000000000 --- a/arch/x86/um/shared/sysdep/syscalls_64.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2003 PathScale, Inc. - * - * Licensed under the GPL - */ - -#ifndef __SYSDEP_X86_64_SYSCALLS_H__ -#define __SYSDEP_X86_64_SYSCALLS_H__ - -#include <linux/msg.h> -#include <linux/shm.h> - -typedef long syscall_handler_t(long, long, long, long, long, long); - -extern syscall_handler_t *sys_call_table[]; - -#define EXECUTE_SYSCALL(syscall, regs) \ - (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ - UPT_SYSCALL_ARG2(®s->regs), \ - UPT_SYSCALL_ARG3(®s->regs), \ - UPT_SYSCALL_ARG4(®s->regs), \ - UPT_SYSCALL_ARG5(®s->regs), \ - UPT_SYSCALL_ARG6(®s->regs))) - -extern syscall_handler_t sys_modify_ldt; -extern syscall_handler_t sys_arch_prctl; - -#endif diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index fbb129023080..1909c2e640b2 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -12,6 +12,7 @@ #include <skas.h> #include <sysdep/tls.h> #include <asm/desc.h> +#include <stub-data.h> /* * If needed we can detect when it's uninitialized. 
@@ -21,14 +22,25 @@ static int host_supports_tls = -1; int host_gdt_entry_tls_min; -static int do_set_thread_area(struct user_desc *info) +static int do_set_thread_area(struct task_struct* task, struct user_desc *info) { int ret; - u32 cpu; - cpu = get_cpu(); - ret = os_set_thread_area(info, userspace_pid[cpu]); - put_cpu(); + if (info->entry_number < host_gdt_entry_tls_min || + info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES) + return -EINVAL; + + if (using_seccomp) { + int idx = info->entry_number - host_gdt_entry_tls_min; + struct stub_data *data = (void *)task->mm->context.id.stack; + + data->arch_data.tls[idx] = *info; + data->arch_data.sync |= BIT(idx); + + return 0; + } + + ret = os_set_thread_area(info, task->mm->context.id.pid); if (ret) printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, " @@ -97,7 +109,7 @@ static int load_TLS(int flags, struct task_struct *to) if (!(flags & O_FORCE) && curr->flushed) continue; - ret = do_set_thread_area(&curr->tls); + ret = do_set_thread_area(current, &curr->tls); if (ret) goto out; @@ -174,7 +186,7 @@ int arch_switch_tls(struct task_struct *to) /* * We have no need whatsoever to switch TLS for kernel threads; beyond * that, that would also result in us calling os_set_thread_area with - * userspace_pid[cpu] == 0, which gives an error. + * task->mm == NULL, which would cause a crash. */ if (likely(to->mm)) return load_TLS(O_FORCE, to); @@ -275,7 +287,7 @@ SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc) return -EFAULT; } - ret = do_set_thread_area(&info); + ret = do_set_thread_area(current, &info); if (ret) return ret; return set_tls_entry(current, &info, idx, 1); diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S index 5b1f2286aea9..6854c52c374b 100644 --- a/arch/x86/virt/vmx/tdx/seamcall.S +++ b/arch/x86/virt/vmx/tdx/seamcall.S @@ -41,6 +41,9 @@ SYM_FUNC_START(__seamcall_ret) TDX_MODULE_CALL host=1 ret=1 SYM_FUNC_END(__seamcall_ret) +/* KVM requires non-instrumentable __seamcall_saved_ret() for TDH.VP.ENTER */ +.section .noinstr.text, "ax" + /* * __seamcall_saved_ret() - Host-side interface functions to SEAM software * (the P-SEAMLDR or the TDX module), with saving output registers to the diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 7fdb37387886..c7a9a087ccaf 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -5,6 +5,7 @@ * Intel Trusted Domain Extensions (TDX) support */ +#include "asm/page_types.h" #define pr_fmt(fmt) "virt/tdx: " fmt #include <linux/types.h> @@ -27,6 +28,7 @@ #include <linux/log2.h> #include <linux/acpi.h> #include <linux/suspend.h> +#include <linux/idr.h> #include <asm/page.h> #include <asm/special_insns.h> #include <asm/msr-index.h> @@ -42,6 +44,8 @@ static u32 tdx_global_keyid __ro_after_init; static u32 tdx_guest_keyid_start __ro_after_init; static u32 tdx_nr_guest_keyids __ro_after_init; +static DEFINE_IDA(tdx_guest_keyid_pool); + static DEFINE_PER_CPU(bool, tdx_lp_initialized); static struct tdmr_info_list tdx_tdmr_list; @@ -52,6 +56,8 @@ static DEFINE_MUTEX(tdx_module_lock); /* All TDX-usable memory regions. Protected by mem_hotplug_lock. 
*/ static LIST_HEAD(tdx_memlist); +static struct tdx_sys_info tdx_sysinfo; + typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) @@ -69,8 +75,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err, args->r9, args->r10, args->r11); } -static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func, - u64 fn, struct tdx_module_args *args) +static __always_inline int sc_retry_prerr(sc_func_t func, + sc_err_func_t err_func, + u64 fn, struct tdx_module_args *args) { u64 sret = sc_retry(func, fn, args); @@ -1060,15 +1067,14 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list) static int init_tdx_module(void) { - struct tdx_sys_info sysinfo; int ret; - ret = get_tdx_sys_info(&sysinfo); + ret = get_tdx_sys_info(&tdx_sysinfo); if (ret) return ret; /* Check whether the kernel can support this module */ - ret = check_features(&sysinfo); + ret = check_features(&tdx_sysinfo); if (ret) return ret; @@ -1089,12 +1095,12 @@ static int init_tdx_module(void) goto out_put_tdxmem; /* Allocate enough space for constructing TDMRs */ - ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr); + ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr); if (ret) goto err_free_tdxmem; /* Cover all TDX-usable memory regions in TDMRs */ - ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr); + ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr); if (ret) goto err_free_tdmrs; @@ -1456,3 +1462,411 @@ void __init tdx_init(void) check_tdx_erratum(); } + +const struct tdx_sys_info *tdx_get_sysinfo(void) +{ + const struct tdx_sys_info *p = NULL; + + /* Make sure all fields in @tdx_sysinfo have been populated */ + mutex_lock(&tdx_module_lock); + if (tdx_module_status == TDX_MODULE_INITIALIZED) + p = (const struct tdx_sys_info *)&tdx_sysinfo; + mutex_unlock(&tdx_module_lock); + + return p; +} +EXPORT_SYMBOL_GPL(tdx_get_sysinfo); + +u32 tdx_get_nr_guest_keyids(void) +{ + return tdx_nr_guest_keyids; +} +EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); + +int tdx_guest_keyid_alloc(void) +{ + return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, + tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc); + +void tdx_guest_keyid_free(unsigned int keyid) +{ + ida_free(&tdx_guest_keyid_pool, keyid); +} +EXPORT_SYMBOL_GPL(tdx_guest_keyid_free); + +static inline u64 tdx_tdr_pa(struct tdx_td *td) +{ + return page_to_phys(td->tdr_page); +} + +static inline u64 tdx_tdvpr_pa(struct tdx_vp *td) +{ + return page_to_phys(td->tdvpr_page); +} + +/* + * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether + * a CLFLUSH of pages is required before handing them to the TDX module. + * Be conservative and make the code simpler by doing the CLFLUSH + * unconditionally. 
+ */ +static void tdx_clflush_page(struct page *page) +{ + clflush_cache_range(page_to_virt(page), PAGE_SIZE); +} + +noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) +{ + args->rcx = tdx_tdvpr_pa(td); + + return __seamcall_saved_ret(TDH_VP_ENTER, args); +} +EXPORT_SYMBOL_GPL(tdh_vp_enter); + +u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(tdcs_page), + .rdx = tdx_tdr_pa(td), + }; + + tdx_clflush_page(tdcs_page); + return seamcall(TDH_MNG_ADDCX, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_addcx); + +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + .r9 = page_to_phys(source), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_add); + +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_sept_add); + +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(tdcx_page), + .rdx = tdx_tdvpr_pa(vp), + }; + + tdx_clflush_page(tdcx_page); + return seamcall(TDH_VP_ADDCX, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_addcx); + +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_aug); + +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_range_block); + +u64 tdh_mng_key_config(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_KEY_CONFIG, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_key_config); + +u64 tdh_mng_create(struct tdx_td *td, u16 hkid) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = hkid, + }; + + tdx_clflush_page(td->tdr_page); + return seamcall(TDH_MNG_CREATE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_create); + +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = tdx_tdr_pa(td), + }; + + tdx_clflush_page(vp->tdvpr_page); + return seamcall(TDH_VP_CREATE, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_create); + +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = field, + }; + u64 ret; + + ret = seamcall_ret(TDH_MNG_RD, &args); + + /* R8: Content of the field, or 0 in case of error. 
*/ + *data = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mng_rd); + +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MR_EXTEND, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mr_extend); + +u64 tdh_mr_finalize(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MR_FINALIZE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mr_finalize); + +u64 tdh_vp_flush(struct tdx_vp *vp) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + }; + + return seamcall(TDH_VP_FLUSH, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_flush); + +u64 tdh_mng_vpflushdone(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_VPFLUSHDONE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone); + +u64 tdh_mng_key_freeid(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_KEY_FREEID, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_key_freeid); + +u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = td_params, + }; + u64 ret; + + ret = seamcall_ret(TDH_MNG_INIT, &args); + + *extended_err = args.rcx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mng_init); + +u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = field, + }; + u64 ret; + + ret = seamcall_ret(TDH_VP_RD, &args); + + /* R8: Content of the field, or 0 in case of error. */ + *data = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_vp_rd); + +u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = field, + .r8 = data, + .r9 = mask, + }; + + return seamcall(TDH_VP_WR, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_wr); + +u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = initial_rcx, + .r8 = x2apicid, + }; + + /* apicid requires version == 1. */ + return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_init); + +/* + * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats. + * So despite the names, they must be interpted specially as described by the spec. Return + * them only for error reporting purposes. 
+ */ +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(page), + }; + u64 ret; + + ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); + + *tdx_pt = args.rcx; + *tdx_owner = args.rdx; + *tdx_size = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); + +u64 tdh_mem_track(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MEM_TRACK, &args); +} +EXPORT_SYMBOL_GPL(tdh_mem_track); + +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_remove); + +u64 tdh_phymem_cache_wb(bool resume) +{ + struct tdx_module_args args = { + .rcx = resume ? 1 : 0, + }; + + return seamcall(TDH_PHYMEM_CACHE_WB, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); + +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); + +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(hkid, page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 4e3d533cdd61..82bb82be8567 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -3,7 +3,6 @@ #define _X86_VIRT_TDX_H #include <linux/bits.h> -#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX @@ -15,13 +14,46 @@ /* * TDX module SEAMCALL leaf functions */ -#define TDH_PHYMEM_PAGE_RDMD 24 -#define TDH_SYS_KEY_CONFIG 31 -#define TDH_SYS_INIT 33 -#define TDH_SYS_RD 34 -#define TDH_SYS_LP_INIT 35 -#define TDH_SYS_TDMR_INIT 36 -#define TDH_SYS_CONFIG 45 +#define TDH_VP_ENTER 0 +#define TDH_MNG_ADDCX 1 +#define TDH_MEM_PAGE_ADD 2 +#define TDH_MEM_SEPT_ADD 3 +#define TDH_VP_ADDCX 4 +#define TDH_MEM_PAGE_AUG 6 +#define TDH_MEM_RANGE_BLOCK 7 +#define TDH_MNG_KEY_CONFIG 8 +#define TDH_MNG_CREATE 9 +#define TDH_MNG_RD 11 +#define TDH_MR_EXTEND 16 +#define TDH_MR_FINALIZE 17 +#define TDH_VP_FLUSH 18 +#define TDH_MNG_VPFLUSHDONE 19 +#define TDH_VP_CREATE 10 +#define TDH_MNG_KEY_FREEID 20 +#define TDH_MNG_INIT 21 +#define TDH_VP_INIT 22 +#define TDH_PHYMEM_PAGE_RDMD 24 +#define TDH_VP_RD 26 +#define TDH_PHYMEM_PAGE_RECLAIM 28 +#define TDH_MEM_PAGE_REMOVE 29 +#define TDH_SYS_KEY_CONFIG 31 +#define TDH_SYS_INIT 33 +#define TDH_SYS_RD 34 +#define TDH_SYS_LP_INIT 35 +#define TDH_SYS_TDMR_INIT 36 +#define TDH_MEM_TRACK 38 +#define TDH_PHYMEM_CACHE_WB 40 +#define TDH_PHYMEM_PAGE_WBINVD 41 +#define TDH_VP_WR 43 +#define TDH_SYS_CONFIG 45 + +/* + * SEAMCALL leaf: + * + * Bit 15:0 Leaf number + * Bit 23:16 Version number + */ +#define TDX_VERSION_SHIFT 16 /* TDX page types */ #define PT_NDA 0x0 diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c index 8027a24d1c6e..13ad2663488b 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c @@ -37,12 +37,62 @@ static int 
get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) return ret; } +static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000000, &val))) + sysinfo_td_ctrl->tdr_base_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000100, &val))) + sysinfo_td_ctrl->tdcs_base_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000200, &val))) + sysinfo_td_ctrl->tdvps_base_size = val; + + return ret; +} + +static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) +{ + int ret = 0; + u64 val; + int i, j; + + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000000, &val))) + sysinfo_td_conf->attributes_fixed0 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000001, &val))) + sysinfo_td_conf->attributes_fixed1 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000002, &val))) + sysinfo_td_conf->xfam_fixed0 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000003, &val))) + sysinfo_td_conf->xfam_fixed1 = val; + if (!ret && !(ret = read_sys_metadata_field(0x9900000100000004, &val))) + sysinfo_td_conf->num_cpuid_config = val; + if (!ret && !(ret = read_sys_metadata_field(0x9900000100000008, &val))) + sysinfo_td_conf->max_vcpus_per_td = val; + if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_leaves)) + return -EINVAL; + for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++) + if (!ret && !(ret = read_sys_metadata_field(0x9900000300000400 + i, &val))) + sysinfo_td_conf->cpuid_config_leaves[i] = val; + if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_values)) + return -EINVAL; + for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++) + for (j = 0; j < 2; j++) + if (!ret && !(ret = read_sys_metadata_field(0x9900000300000500 + i * 2 + j, &val))) + sysinfo_td_conf->cpuid_config_values[i][j] = val; + + return ret; +} + static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) { int ret = 0; ret = ret ?: get_tdx_sys_info_features(&sysinfo->features); ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr); + ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl); + ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf); return ret; } |
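
Illustrative sketch only, not part of the patch above: the new arch/x86/virt/vmx/tdx/tdx.c code exports a guest KeyID allocator (tdx_guest_keyid_alloc()/tdx_guest_keyid_free()) plus thin SEAMCALL wrappers such as tdh_mng_create() and tdh_mng_key_config(). The fragment below shows how a hypothetical caller (for example a KVM-side helper) might combine them; the example_td_keyid_setup() name, the asm/tdx.h include and the error handling are assumptions, and a real caller would also deal with per-package key programming, VP flushing and cache write-back on teardown.

/*
 * Hypothetical sketch: reserve a TDX guest KeyID and bind it to a new TD
 * root (TDR) page using the helpers exported by tdx.c above.
 */
#include <linux/errno.h>
#include <asm/tdx.h>	/* assumed location of the tdx_*()/tdh_*() declarations */

static int example_td_keyid_setup(struct tdx_td *td)
{
	int keyid;
	u64 err;

	keyid = tdx_guest_keyid_alloc();	/* pick a free HKID from the IDA pool */
	if (keyid < 0)
		return keyid;

	err = tdh_mng_create(td, keyid);	/* TDH.MNG.CREATE: bind the HKID to the TDR page */
	if (err)
		goto free_keyid;

	err = tdh_mng_key_config(td);		/* TDH.MNG.KEY.CONFIG: program the key in hardware */
	if (err)
		goto free_hkid;

	return 0;

free_hkid:
	tdh_mng_key_freeid(td);			/* tell the TDX module the HKID is no longer in use */
free_keyid:
	tdx_guest_keyid_free(keyid);		/* return the KeyID to the kernel-side pool */
	return -EIO;
}

The wrappers deliberately return the raw SEAMCALL status as a u64 and pass extended error words back through out-parameters, so the policy of decoding and retrying is left entirely to the caller; the sketch above simply treats any non-zero status as failure.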