Diffstat (limited to 'arch/x86/virt')
-rw-r--r--  arch/x86/virt/Makefile                       |    2
-rw-r--r--  arch/x86/virt/svm/Makefile                   |    4
-rw-r--r--  arch/x86/virt/svm/cmdline.c                  |   45
-rw-r--r--  arch/x86/virt/svm/sev.c                      | 1072
-rw-r--r--  arch/x86/virt/vmx/tdx/Makefile               |    2
-rw-r--r--  arch/x86/virt/vmx/tdx/tdx.c                  | 1458
-rw-r--r--  arch/x86/virt/vmx/tdx/tdx.h                  |   89
-rw-r--r--  arch/x86/virt/vmx/tdx/tdx_global_metadata.c  |   48
-rw-r--r--  arch/x86/virt/vmx/tdx/tdx_global_metadata.h  |   25
9 files changed, 2743 insertions, 2 deletions
diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile index 1e36502cd738..ea343fc392dc 100644 --- a/arch/x86/virt/Makefile +++ b/arch/x86/virt/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += vmx/ +obj-y += svm/ vmx/ diff --git a/arch/x86/virt/svm/Makefile b/arch/x86/virt/svm/Makefile new file mode 100644 index 000000000000..eca6d71355fa --- /dev/null +++ b/arch/x86/virt/svm/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KVM_AMD_SEV) += sev.o +obj-$(CONFIG_CPU_SUP_AMD) += cmdline.o diff --git a/arch/x86/virt/svm/cmdline.c b/arch/x86/virt/svm/cmdline.c new file mode 100644 index 000000000000..affa2759fa20 --- /dev/null +++ b/arch/x86/virt/svm/cmdline.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM-SEV command line parsing support + * + * Copyright (C) 2023 - 2024 Advanced Micro Devices, Inc. + * + * Author: Michael Roth <michael.roth@amd.com> + */ + +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/cache.h> +#include <linux/cpufeature.h> + +#include <asm/sev-common.h> + +struct sev_config sev_cfg __read_mostly; + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + if (!strcmp(s, "nosnp")) { + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + continue; + } else { + goto warn; + } + } + +warn: + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c new file mode 100644 index 000000000000..42e74a5a7d78 --- /dev/null +++ b/arch/x86/virt/svm/sev.c @@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM-SEV Host Support. + * + * Copyright (C) 2023 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra <ashish.kalra@amd.com> + * + */ + +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/iommu.h> +#include <linux/amd-iommu.h> +#include <linux/nospec.h> + +#include <asm/sev.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> +#include <asm/iommu.h> + +/* + * The RMP entry information as returned by the RMPREAD instruction. + */ +struct rmpentry { + u64 gpa; + u8 assigned :1, + rsvd1 :7; + u8 pagesize :1, + hpage_region_status :1, + rsvd2 :6; + u8 immutable :1, + rsvd3 :7; + u8 rsvd4; + u32 asid; +} __packed; + +/* + * The raw RMP entry format is not architectural. The format is defined in PPR + * Family 19h Model 01h, Rev B1 processor. This format represents the actual + * entry in the RMP table memory. The bitfield definitions are used for machines + * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo" + * fields are only used for dumping the raw data. 
+ */ +struct rmpentry_raw { + union { + struct { + u64 assigned : 1, + pagesize : 1, + immutable : 1, + rsvd1 : 9, + gpa : 39, + asid : 10, + vmsa : 1, + validated : 1, + rsvd2 : 1; + }; + u64 lo; + }; + u64 hi; +} __packed; + +/* + * The first 16KB from the RMP_BASE is used by the processor for the + * bookkeeping, the range needs to be added during the RMP entry lookup. + */ +#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 + +/* + * For a non-segmented RMP table, use the maximum physical addressing as the + * segment size in order to always arrive at index 0 in the table. + */ +#define RMPTABLE_NON_SEGMENTED_SHIFT 52 + +struct rmp_segment_desc { + struct rmpentry_raw *rmp_entry; + u64 max_index; + u64 size; +}; + +/* + * Segmented RMP Table support. + * - The segment size is used for two purposes: + * - Identify the amount of memory covered by an RMP segment + * - Quickly locate an RMP segment table entry for a physical address + * + * - The RMP segment table contains pointers to an RMP table that covers + * a specific portion of memory. There can be up to 512 8-byte entries, + * one pages worth. + */ +#define RST_ENTRY_MAPPED_SIZE(x) ((x) & GENMASK_ULL(19, 0)) +#define RST_ENTRY_SEGMENT_BASE(x) ((x) & GENMASK_ULL(51, 20)) + +#define RST_SIZE SZ_4K +static struct rmp_segment_desc **rmp_segment_table __ro_after_init; +static unsigned int rst_max_index __ro_after_init = 512; + +static unsigned int rmp_segment_shift; +static u64 rmp_segment_size; +static u64 rmp_segment_mask; + +#define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift) +#define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask))) + +static u64 rmp_cfg; + +/* Mask to apply to a PFN to get the first PFN of a 2MB page */ +#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) + +static u64 probed_rmp_base, probed_rmp_size; + +static LIST_HEAD(snp_leaked_pages_list); +static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); + +static unsigned long snp_nr_leaked_pages; + +#undef pr_fmt +#define pr_fmt(fmt) "SEV-SNP: " fmt + +static int __mfd_enable(unsigned int cpu) +{ + u64 val; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + rdmsrl(MSR_AMD64_SYSCFG, val); + + val |= MSR_AMD64_SYSCFG_MFDM; + + wrmsrl(MSR_AMD64_SYSCFG, val); + + return 0; +} + +static __init void mfd_enable(void *arg) +{ + __mfd_enable(smp_processor_id()); +} + +static int __snp_enable(unsigned int cpu) +{ + u64 val; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + rdmsrl(MSR_AMD64_SYSCFG, val); + + val |= MSR_AMD64_SYSCFG_SNP_EN; + val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; + + wrmsrl(MSR_AMD64_SYSCFG, val); + + return 0; +} + +static __init void snp_enable(void *arg) +{ + __snp_enable(smp_processor_id()); +} + +static void __init __snp_fixup_e820_tables(u64 pa) +{ + if (IS_ALIGNED(pa, PMD_SIZE)) + return; + + /* + * Handle cases where the RMP table placement by the BIOS is not + * 2M aligned and the kexec kernel could try to allocate + * from within that chunk which then causes a fatal RMP fault. + * + * The e820_table needs to be updated as it is converted to + * kernel memory resources and used by KEXEC_FILE_LOAD syscall + * to load kexec segments. + * + * The e820_table_firmware needs to be updated as it is exposed + * to sysfs and used by the KEXEC_LOAD syscall to load kexec + * segments. + * + * The e820_table_kexec needs to be updated as it passed to + * the kexec-ed kernel. 
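As an illustrative aside (not part of this patch), the RST_ENTRY_INDEX()/RMP_ENTRY_INDEX() macros above split a system physical address into an RMP segment table slot and an entry index within that segment. A minimal stand-alone sketch of that arithmetic, assuming a hypothetical 64GB segment size (shift 36) and an arbitrary example address:

    /* Illustrative only; mirrors the indexing math, not patch code. */
    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_PAGE_SHIFT 12

    int main(void)
    {
    	unsigned int segment_shift = 36;                  /* assumed 64GB segments */
    	uint64_t segment_mask = (1ULL << segment_shift) - 1;
    	uint64_t paddr = 0x1234567000ULL;                 /* arbitrary example PA */
    	uint64_t rst_index = paddr >> segment_shift;
    	uint64_t rmp_index = (paddr & segment_mask) >> DEMO_PAGE_SHIFT;

    	printf("PA 0x%llx -> RST entry %llu, RMP entry %llu\n",
    	       (unsigned long long)paddr,
    	       (unsigned long long)rst_index,
    	       (unsigned long long)rmp_index);
    	return 0;
    }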
+ */ + pa = ALIGN_DOWN(pa, PMD_SIZE); + if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { + pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); + e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + if (!memblock_is_region_reserved(pa, PMD_SIZE)) + memblock_reserve(pa, PMD_SIZE); + } +} + +static void __init fixup_e820_tables_for_segmented_rmp(void) +{ + u64 pa, *rst, size, mapped_size; + unsigned int i; + + __snp_fixup_e820_tables(probed_rmp_base); + + pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + + __snp_fixup_e820_tables(pa + RST_SIZE); + + rst = early_memremap(pa, RST_SIZE); + if (!rst) + return; + + for (i = 0; i < rst_max_index; i++) { + pa = RST_ENTRY_SEGMENT_BASE(rst[i]); + mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); + if (!mapped_size) + continue; + + __snp_fixup_e820_tables(pa); + + /* + * Mapped size in GB. Mapped size is allowed to exceed + * the segment coverage size, but gets reduced to the + * segment coverage size. + */ + mapped_size <<= 30; + if (mapped_size > rmp_segment_size) + mapped_size = rmp_segment_size; + + /* Calculate the RMP segment size (16 bytes/page mapped) */ + size = PHYS_PFN(mapped_size) << 4; + + __snp_fixup_e820_tables(pa + size); + } + + early_memunmap(rst, RST_SIZE); +} + +static void __init fixup_e820_tables_for_contiguous_rmp(void) +{ + __snp_fixup_e820_tables(probed_rmp_base); + __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); +} + +void __init snp_fixup_e820_tables(void) +{ + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { + fixup_e820_tables_for_segmented_rmp(); + } else { + fixup_e820_tables_for_contiguous_rmp(); + } +} + +static bool __init clear_rmptable_bookkeeping(void) +{ + void *bk; + + bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB); + if (!bk) { + pr_err("Failed to map RMP bookkeeping area\n"); + return false; + } + + memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ); + + memunmap(bk); + + return true; +} + +static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa) +{ + u64 rst_index, rmp_segment_size_max; + struct rmp_segment_desc *desc; + void *rmp_segment; + + /* Calculate the maximum size an RMP can be (16 bytes/page mapped) */ + rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4; + + /* Validate the RMP segment size */ + if (segment_size > rmp_segment_size_max) { + pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n", + segment_size, rmp_segment_size_max); + return false; + } + + /* Validate the RMP segment table index */ + rst_index = RST_ENTRY_INDEX(pa); + if (rst_index >= rst_max_index) { + pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n", + pa, rmp_segment_size); + return false; + } + + if (rmp_segment_table[rst_index]) { + pr_err("RMP segment descriptor already exists at index %llu\n", rst_index); + return false; + } + + rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB); + if (!rmp_segment) { + pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n", + segment_pa, segment_size); + return false; + } + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + memunmap(rmp_segment); + return false; + } + + desc->rmp_entry = rmp_segment; + desc->max_index = segment_size / sizeof(*desc->rmp_entry); + desc->size = segment_size; + + 
rmp_segment_table[rst_index] = desc; + + return true; +} + +static void __init free_rmp_segment_table(void) +{ + unsigned int i; + + for (i = 0; i < rst_max_index; i++) { + struct rmp_segment_desc *desc; + + desc = rmp_segment_table[i]; + if (!desc) + continue; + + memunmap(desc->rmp_entry); + + kfree(desc); + } + + free_page((unsigned long)rmp_segment_table); + + rmp_segment_table = NULL; +} + +/* Allocate the table used to index into the RMP segments */ +static bool __init alloc_rmp_segment_table(void) +{ + struct page *page; + + page = alloc_page(__GFP_ZERO); + if (!page) + return false; + + rmp_segment_table = page_address(page); + + return true; +} + +static bool __init setup_contiguous_rmptable(void) +{ + u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end; + + if (!probed_rmp_size) + return false; + + rmp_end = probed_rmp_base + probed_rmp_size - 1; + + /* + * Calculate the amount of memory that must be reserved by the BIOS to + * address the whole RAM, including the bookkeeping area. The RMP itself + * must also be covered. + */ + max_rmp_pfn = max_pfn; + if (PFN_UP(rmp_end) > max_pfn) + max_rmp_pfn = PFN_UP(rmp_end); + + calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; + if (calc_rmp_sz > probed_rmp_size) { + pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + calc_rmp_sz, probed_rmp_size); + return false; + } + + if (!alloc_rmp_segment_table()) + return false; + + /* Map only the RMP entries */ + rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; + + if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) { + free_rmp_segment_table(); + return false; + } + + return true; +} + +static bool __init setup_segmented_rmptable(void) +{ + u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max; + unsigned int i, max_index; + + if (!probed_rmp_base) + return false; + + if (!alloc_rmp_segment_table()) + return false; + + rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB); + if (!rst) { + pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa); + goto e_free; + } + + pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30); + + ram_pa_max = max_pfn << PAGE_SHIFT; + + max_index = 0; + ram_pa_end = 0; + for (i = 0; i < rst_max_index; i++) { + u64 rmp_segment, rmp_size, mapped_size; + + mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); + if (!mapped_size) + continue; + + max_index = i; + + /* + * Mapped size in GB. Mapped size is allowed to exceed the + * segment coverage size, but gets reduced to the segment + * coverage size. + */ + mapped_size <<= 30; + if (mapped_size > rmp_segment_size) { + pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n", + i, mapped_size, rmp_segment_size); + mapped_size = rmp_segment_size; + } + + rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]); + + /* Calculate the RMP segment size (16 bytes/page mapped) */ + rmp_size = PHYS_PFN(mapped_size) << 4; + + pa = (u64)i << rmp_segment_shift; + + /* + * Some segments may be for MMIO mapped above system RAM. These + * segments are used for Trusted I/O. 
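As an illustrative aside (not part of this patch), the calc_rmp_sz check in setup_contiguous_rmptable() amounts to 16 bytes of RMP per 4K page of addressable memory plus the fixed 16KB bookkeeping area. For example, covering 1TB of RAM requires (1TB / 4KB) * 16 bytes = 4GB of RMP, roughly 1/256th (about 0.4%) of the memory covered; if the BIOS reserved less than that, the setup is rejected.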
+ */ + if (pa < ram_pa_max) + ram_pa_end = pa + mapped_size; + + if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa)) + goto e_unmap; + + pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n", + i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1); + } + + if (ram_pa_max > ram_pa_end) { + pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + ram_pa_max, ram_pa_end); + goto e_unmap; + } + + /* Adjust the maximum index based on the found segments */ + rst_max_index = max_index + 1; + + memunmap(rst); + + return true; + +e_unmap: + memunmap(rst); + +e_free: + free_rmp_segment_table(); + + return false; +} + +static bool __init setup_rmptable(void) +{ + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { + return setup_segmented_rmptable(); + } else { + return setup_contiguous_rmptable(); + } +} + +/* + * Do the necessary preparations which are verified by the firmware as + * described in the SNP_INIT_EX firmware command description in the SNP + * firmware ABI spec. + */ +int __init snp_rmptable_init(void) +{ + unsigned int i; + u64 val; + + if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP))) + return -ENOSYS; + + if (WARN_ON_ONCE(!amd_iommu_snp_en)) + return -ENOSYS; + + if (!setup_rmptable()) + return -ENOSYS; + + /* + * Check if SEV-SNP is already enabled, this can happen in case of + * kexec boot. + */ + rdmsrl(MSR_AMD64_SYSCFG, val); + if (val & MSR_AMD64_SYSCFG_SNP_EN) + goto skip_enable; + + /* Zero out the RMP bookkeeping area */ + if (!clear_rmptable_bookkeeping()) { + free_rmp_segment_table(); + return -ENOSYS; + } + + /* Zero out the RMP entries */ + for (i = 0; i < rst_max_index; i++) { + struct rmp_segment_desc *desc; + + desc = rmp_segment_table[i]; + if (!desc) + continue; + + memset(desc->rmp_entry, 0, desc->size); + } + + /* Flush the caches to ensure that data is written before SNP is enabled. */ + wbinvd_on_all_cpus(); + + /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */ + on_each_cpu(mfd_enable, NULL, 1); + + on_each_cpu(snp_enable, NULL, 1); + +skip_enable: + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); + + /* + * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic + * notifier is invoked to do SNP IOMMU shutdown before kdump. 
+ */ + crash_kexec_post_notifiers = true; + + return 0; +} + +static void set_rmp_segment_info(unsigned int segment_shift) +{ + rmp_segment_shift = segment_shift; + rmp_segment_size = 1ULL << rmp_segment_shift; + rmp_segment_mask = rmp_segment_size - 1; +} + +#define RMP_ADDR_MASK GENMASK_ULL(51, 13) + +static bool probe_contiguous_rmptable_info(void) +{ + u64 rmp_sz, rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + + if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + if (rmp_base > rmp_end) { + pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); + return false; + } + + rmp_sz = rmp_end - rmp_base + 1; + + /* Treat the contiguous RMP table as a single segment */ + rst_max_index = 1; + + set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT); + + probed_rmp_base = rmp_base; + probed_rmp_size = rmp_sz; + + pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", + rmp_base, rmp_end); + + return true; +} + +static bool probe_segmented_rmptable_info(void) +{ + unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max; + u64 rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + if (!(rmp_base & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + WARN_ONCE(rmp_end & RMP_ADDR_MASK, + "Segmented RMP enabled but RMP_END MSR is non-zero\n"); + + /* Obtain the min and max supported RMP segment size */ + eax = cpuid_eax(0x80000025); + segment_shift_min = eax & GENMASK(5, 0); + segment_shift_max = (eax & GENMASK(11, 6)) >> 6; + + /* Verify the segment size is within the supported limits */ + segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg); + if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) { + pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n", + segment_shift, segment_shift_min, segment_shift_max); + return false; + } + + /* Override the max supported RST index if a hardware limit exists */ + ebx = cpuid_ebx(0x80000025); + if (ebx & BIT(10)) + rst_max_index = ebx & GENMASK(9, 0); + + set_rmp_segment_info(segment_shift); + + probed_rmp_base = rmp_base; + probed_rmp_size = 0; + + pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n", + rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE); + + return true; +} + +bool snp_probe_rmptable_info(void) +{ + if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP)) + rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg); + + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) + return probe_segmented_rmptable_info(); + else + return probe_contiguous_rmptable_info(); +} + +/* + * About the array_index_nospec() usage below: + * + * This function can get called by exported functions like + * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among + * others, and since the @pfn passed in cannot always be trusted, + * speculation should be stopped as a protective measure. 
+ */ +static struct rmpentry_raw *get_raw_rmpentry(u64 pfn) +{ + u64 paddr, rst_index, segment_index; + struct rmp_segment_desc *desc; + + if (!rmp_segment_table) + return ERR_PTR(-ENODEV); + + paddr = pfn << PAGE_SHIFT; + + rst_index = RST_ENTRY_INDEX(paddr); + if (unlikely(rst_index >= rst_max_index)) + return ERR_PTR(-EFAULT); + + rst_index = array_index_nospec(rst_index, rst_max_index); + + desc = rmp_segment_table[rst_index]; + if (unlikely(!desc)) + return ERR_PTR(-EFAULT); + + segment_index = RMP_ENTRY_INDEX(paddr); + if (unlikely(segment_index >= desc->max_index)) + return ERR_PTR(-EFAULT); + + segment_index = array_index_nospec(segment_index, desc->max_index); + + return desc->rmp_entry + segment_index; +} + +static int get_rmpentry(u64 pfn, struct rmpentry *e) +{ + struct rmpentry_raw *e_raw; + + if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) { + int ret; + + /* Binutils version 2.44 supports the RMPREAD mnemonic. */ + asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd" + : "=a" (ret) + : "a" (pfn << PAGE_SHIFT), "c" (e) + : "memory", "cc"); + + return ret; + } + + e_raw = get_raw_rmpentry(pfn); + if (IS_ERR(e_raw)) + return PTR_ERR(e_raw); + + /* + * Map the raw RMP table entry onto the RMPREAD output format. + * The 2MB region status indicator (hpage_region_status field) is not + * calculated, since the overhead could be significant and the field + * is not used. + */ + memset(e, 0, sizeof(*e)); + e->gpa = e_raw->gpa << PAGE_SHIFT; + e->asid = e_raw->asid; + e->assigned = e_raw->assigned; + e->pagesize = e_raw->pagesize; + e->immutable = e_raw->immutable; + + return 0; +} + +static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level) +{ + struct rmpentry e_large; + int ret; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + ret = get_rmpentry(pfn, e); + if (ret) + return ret; + + /* + * Find the authoritative RMP entry for a PFN. This can be either a 4K + * RMP entry or a special large RMP entry that is authoritative for a + * whole 2M area. + */ + ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large); + if (ret) + return ret; + + *level = RMP_TO_PG_LEVEL(e_large.pagesize); + + return 0; +} + +int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) +{ + struct rmpentry e; + int ret; + + ret = __snp_lookup_rmpentry(pfn, &e, level); + if (ret) + return ret; + + *assigned = !!e.assigned; + return 0; +} +EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); + +/* + * Dump the raw RMP entry for a particular PFN. These bits are documented in the + * PPR for a particular CPU model and provide useful information about how a + * particular PFN is being utilized by the kernel/firmware at the time certain + * unexpected events occur, such as RMP faults. 
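As an illustrative aside (not part of this patch), a hypothetical user of the exported snp_lookup_rmpentry() could look like the sketch below; the helper name and the error-handling policy are assumptions made purely for illustration:

    /* Hypothetical usage sketch, not part of the patch. */
    static bool demo_pfn_is_assigned(u64 pfn)
    {
    	bool assigned;
    	int level, ret;

    	ret = snp_lookup_rmpentry(pfn, &assigned, &level);
    	if (ret)
    		return false;	/* illustrative policy: treat lookup failure as unassigned */

    	/* @level reports whether a 4K or 2M RMP entry is authoritative here. */
    	return assigned;
    }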
+ */ +static void dump_rmpentry(u64 pfn) +{ + struct rmpentry_raw *e_raw; + u64 pfn_i, pfn_end; + struct rmpentry e; + int level, ret; + + ret = __snp_lookup_rmpentry(pfn, &e, &level); + if (ret) { + pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n", + pfn, ret); + return; + } + + if (e.assigned) { + e_raw = get_raw_rmpentry(pfn); + if (IS_ERR(e_raw)) { + pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n", + pfn, PTR_ERR(e_raw)); + return; + } + + pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n", + pfn, e_raw->lo, e_raw->hi); + return; + } + + /* + * If the RMP entry for a particular PFN is not in an assigned state, + * then it is sometimes useful to get an idea of whether or not any RMP + * entries for other PFNs within the same 2MB region are assigned, since + * those too can affect the ability to access a particular PFN in + * certain situations, such as when the PFN is being accessed via a 2MB + * mapping in the host page table. + */ + pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD); + pfn_end = pfn_i + PTRS_PER_PMD; + + pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n", + pfn, pfn_i, pfn_end); + + while (pfn_i < pfn_end) { + e_raw = get_raw_rmpentry(pfn_i); + if (IS_ERR(e_raw)) { + pr_err("Error %ld reading RMP contents for PFN 0x%llx\n", + PTR_ERR(e_raw), pfn_i); + pfn_i++; + continue; + } + + if (e_raw->lo || e_raw->hi) + pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi); + pfn_i++; + } +} + +void snp_dump_hva_rmpentry(unsigned long hva) +{ + unsigned long paddr; + unsigned int level; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd += pgd_index(hva); + pte = lookup_address_in_pgd(pgd, hva, &level); + + if (!pte) { + pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva); + return; + } + + paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level)); + dump_rmpentry(PHYS_PFN(paddr)); +} + +/* + * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the + * Validated bit. + */ +int psmash(u64 pfn) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + if (!pfn_valid(pfn)) + return -EINVAL; + + /* Binutils version 2.36 supports the PSMASH mnemonic. */ + asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" + : "=a" (ret) + : "a" (paddr) + : "memory", "cc"); + + return ret; +} +EXPORT_SYMBOL_GPL(psmash); + +/* + * If the kernel uses a 2MB or larger directmap mapping to write to an address, + * and that mapping contains any 4KB pages that are set to private in the RMP + * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that + * owns the PFNs being transitioned will never attempt such a write, but other + * kernel tasks writing to other PFNs in the range may trigger these checks + * inadvertently due a large directmap mapping that happens to overlap such a + * PFN. + * + * Prevent this by splitting any 2MB+ mappings that might end up containing a + * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the + * PFN/rmp_level passed in. + * + * Note that there is no attempt here to scan all the RMP entries for the 2MB + * physical range, since it would only be worthwhile in determining if a + * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of + * the same shared/private state, thus avoiding the need to split the mapping. 
+ * But that would mean the entries are currently in a mixed state, and so the + * mapping would have already been split as a result of prior transitions. + * And since the 4K split is only done if the mapping is 2MB+, and there isn't + * currently a mechanism in place to restore 2MB+ mappings, such a check would + * not provide any usable benefit. + * + * More specifics on how these checks are carried out can be found in APM + * Volume 2, "RMP and VMPL Access Checks". + */ +static int adjust_direct_map(u64 pfn, int rmp_level) +{ + unsigned long vaddr; + unsigned int level; + int npages, ret; + pte_t *pte; + + /* + * pfn_to_kaddr() will return a vaddr only within the direct + * map range. + */ + vaddr = (unsigned long)pfn_to_kaddr(pfn); + + /* Only 4KB/2MB RMP entries are supported by current hardware. */ + if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M)) + return -EINVAL; + + if (!pfn_valid(pfn)) + return -EINVAL; + + if (rmp_level == PG_LEVEL_2M && + (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1))) + return -EINVAL; + + /* + * If an entire 2MB physical range is being transitioned, then there is + * no risk of RMP #PFs due to write accesses from overlapping mappings, + * since even accesses from 1GB mappings will be treated as 2MB accesses + * as far as RMP table checks are concerned. + */ + if (rmp_level == PG_LEVEL_2M) + return 0; + + pte = lookup_address(vaddr, &level); + if (!pte || pte_none(*pte)) + return 0; + + if (level == PG_LEVEL_4K) + return 0; + + npages = page_level_size(rmp_level) / PAGE_SIZE; + ret = set_memory_4k(vaddr, npages); + if (ret) + pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n", + pfn, ret); + + return ret; +} + +/* + * It is expected that those operations are seldom enough so that no mutual + * exclusion of updaters is needed and thus the overlap error condition below + * should happen very rarely and would get resolved relatively quickly by + * the firmware. + * + * If not, one could consider introducing a mutex or so here to sync concurrent + * RMP updates and thus diminish the amount of cases where firmware needs to + * lock 2M ranges to protect against concurrent updates. + * + * The optimal solution would be range locking to avoid locking disjoint + * regions unnecessarily but there's no support for that yet. + */ +static int rmpupdate(u64 pfn, struct rmp_state *state) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret, level; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + level = RMP_TO_PG_LEVEL(state->pagesize); + + if (adjust_direct_map(pfn, level)) + return -EFAULT; + + do { + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" + : "=a" (ret) + : "a" (paddr), "c" ((unsigned long)state) + : "memory", "cc"); + } while (ret == RMPUPDATE_FAIL_OVERLAP); + + if (ret) { + pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n", + pfn, level, ret); + dump_rmpentry(pfn); + dump_stack(); + return -EFAULT; + } + + return 0; +} + +/* Transition a page to guest-owned/private state in the RMP table. */ +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable) +{ + struct rmp_state state; + + memset(&state, 0, sizeof(state)); + state.assigned = 1; + state.asid = asid; + state.immutable = immutable; + state.gpa = gpa; + state.pagesize = PG_LEVEL_TO_RMP(level); + + return rmpupdate(pfn, &state); +} +EXPORT_SYMBOL_GPL(rmp_make_private); + +/* Transition a page to hypervisor-owned/shared state in the RMP table. 
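As an illustrative aside (not part of this patch), rmpupdate() is only reachable through the two exported wrappers; a hypothetical caller that assigns a 4K page to an SNP guest and later returns it to the shared state might follow the sketch below (the function name and error policy are assumptions, and rmp_make_shared() itself is defined just below):

    /* Hypothetical usage sketch, not part of the patch. */
    static int demo_assign_and_release(u64 pfn, u64 gpa, u32 asid)
    {
    	int ret;

    	/* Mark the 4K page as guest-owned/private for @asid. */
    	ret = rmp_make_private(pfn, gpa, PG_LEVEL_4K, asid, false);
    	if (ret)
    		return ret;

    	/* ... the page is now SNP guest private memory ... */

    	/* Transition it back to hypervisor-owned/shared when done. */
    	return rmp_make_shared(pfn, PG_LEVEL_4K);
    }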
*/ +int rmp_make_shared(u64 pfn, enum pg_level level) +{ + struct rmp_state state; + + memset(&state, 0, sizeof(state)); + state.pagesize = PG_LEVEL_TO_RMP(level); + + return rmpupdate(pfn, &state); +} +EXPORT_SYMBOL_GPL(rmp_make_shared); + +void snp_leak_pages(u64 pfn, unsigned int npages) +{ + struct page *page = pfn_to_page(pfn); + + pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages); + + spin_lock(&snp_leaked_pages_list_lock); + while (npages--) { + + /* + * Reuse the page's buddy list for chaining into the leaked + * pages list. This page should not be on a free list currently + * and is also unsafe to be added to a free list. + */ + if (likely(!PageCompound(page)) || + + /* + * Skip inserting tail pages of compound page as + * page->buddy_list of tail pages is not usable. + */ + (PageHead(page) && compound_nr(page) <= npages)) + list_add_tail(&page->buddy_list, &snp_leaked_pages_list); + + dump_rmpentry(pfn); + snp_nr_leaked_pages++; + pfn++; + page++; + } + spin_unlock(&snp_leaked_pages_list_lock); +} +EXPORT_SYMBOL_GPL(snp_leak_pages); + +void kdump_sev_callback(void) +{ + /* + * Do wbinvd() on remote CPUs when SNP is enabled in order to + * safely do SNP_SHUTDOWN on the local CPU. + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + wbinvd(); +} diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile index 46ef8f73aebb..90da47eb85ee 100644 --- a/arch/x86/virt/vmx/tdx/Makefile +++ b/arch/x86/virt/vmx/tdx/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += seamcall.o +obj-y += seamcall.o tdx.o diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c new file mode 100644 index 000000000000..7fdb37387886 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -0,0 +1,1458 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(c) 2023 Intel Corporation. + * + * Intel Trusted Domain Extensions (TDX) support + */ + +#define pr_fmt(fmt) "virt/tdx: " fmt + +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/printk.h> +#include <linux/cpu.h> +#include <linux/spinlock.h> +#include <linux/percpu-defs.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/minmax.h> +#include <linux/sizes.h> +#include <linux/pfn.h> +#include <linux/align.h> +#include <linux/sort.h> +#include <linux/log2.h> +#include <linux/acpi.h> +#include <linux/suspend.h> +#include <asm/page.h> +#include <asm/special_insns.h> +#include <asm/msr-index.h> +#include <asm/msr.h> +#include <asm/cpufeature.h> +#include <asm/tdx.h> +#include <asm/cpu_device_id.h> +#include <asm/processor.h> +#include <asm/mce.h> +#include "tdx.h" + +static u32 tdx_global_keyid __ro_after_init; +static u32 tdx_guest_keyid_start __ro_after_init; +static u32 tdx_nr_guest_keyids __ro_after_init; + +static DEFINE_PER_CPU(bool, tdx_lp_initialized); + +static struct tdmr_info_list tdx_tdmr_list; + +static enum tdx_module_status_t tdx_module_status; +static DEFINE_MUTEX(tdx_module_lock); + +/* All TDX-usable memory regions. Protected by mem_hotplug_lock. 
*/ +static LIST_HEAD(tdx_memlist); + +typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); + +static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) +{ + pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err); +} + +static inline void seamcall_err_ret(u64 fn, u64 err, + struct tdx_module_args *args) +{ + seamcall_err(fn, err, args); + pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n", + args->rcx, args->rdx, args->r8); + pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n", + args->r9, args->r10, args->r11); +} + +static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func, + u64 fn, struct tdx_module_args *args) +{ + u64 sret = sc_retry(func, fn, args); + + if (sret == TDX_SUCCESS) + return 0; + + if (sret == TDX_SEAMCALL_VMFAILINVALID) + return -ENODEV; + + if (sret == TDX_SEAMCALL_GP) + return -EOPNOTSUPP; + + if (sret == TDX_SEAMCALL_UD) + return -EACCES; + + err_func(fn, sret, args); + return -EIO; +} + +#define seamcall_prerr(__fn, __args) \ + sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args)) + +#define seamcall_prerr_ret(__fn, __args) \ + sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args)) + +/* + * Do the module global initialization once and return its result. + * It can be done on any cpu. It's always called with interrupts + * disabled. + */ +static int try_init_module_global(void) +{ + struct tdx_module_args args = {}; + static DEFINE_RAW_SPINLOCK(sysinit_lock); + static bool sysinit_done; + static int sysinit_ret; + + lockdep_assert_irqs_disabled(); + + raw_spin_lock(&sysinit_lock); + + if (sysinit_done) + goto out; + + /* RCX is module attributes and all bits are reserved */ + args.rcx = 0; + sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args); + + /* + * The first SEAMCALL also detects the TDX module, thus + * it can fail due to the TDX module is not loaded. + * Dump message to let the user know. + */ + if (sysinit_ret == -ENODEV) + pr_err("module not loaded\n"); + + sysinit_done = true; +out: + raw_spin_unlock(&sysinit_lock); + return sysinit_ret; +} + +/** + * tdx_cpu_enable - Enable TDX on local cpu + * + * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module + * global initialization SEAMCALL if not done) on local cpu to make this + * cpu be ready to run any other SEAMCALLs. + * + * Always call this function via IPI function calls. + * + * Return 0 on success, otherwise errors. + */ +int tdx_cpu_enable(void) +{ + struct tdx_module_args args = {}; + int ret; + + if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) + return -ENODEV; + + lockdep_assert_irqs_disabled(); + + if (__this_cpu_read(tdx_lp_initialized)) + return 0; + + /* + * The TDX module global initialization is the very first step + * to enable TDX. Need to do it first (if hasn't been done) + * before the per-cpu initialization. + */ + ret = try_init_module_global(); + if (ret) + return ret; + + ret = seamcall_prerr(TDH_SYS_LP_INIT, &args); + if (ret) + return ret; + + __this_cpu_write(tdx_lp_initialized, true); + + return 0; +} +EXPORT_SYMBOL_GPL(tdx_cpu_enable); + +/* + * Add a memory region as a TDX memory block. The caller must make sure + * all memory regions are added in address ascending order and don't + * overlap. 
+ */ +static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, + unsigned long end_pfn, int nid) +{ + struct tdx_memblock *tmb; + + tmb = kmalloc(sizeof(*tmb), GFP_KERNEL); + if (!tmb) + return -ENOMEM; + + INIT_LIST_HEAD(&tmb->list); + tmb->start_pfn = start_pfn; + tmb->end_pfn = end_pfn; + tmb->nid = nid; + + /* @tmb_list is protected by mem_hotplug_lock */ + list_add_tail(&tmb->list, tmb_list); + return 0; +} + +static void free_tdx_memlist(struct list_head *tmb_list) +{ + /* @tmb_list is protected by mem_hotplug_lock */ + while (!list_empty(tmb_list)) { + struct tdx_memblock *tmb = list_first_entry(tmb_list, + struct tdx_memblock, list); + + list_del(&tmb->list); + kfree(tmb); + } +} + +/* + * Ensure that all memblock memory regions are convertible to TDX + * memory. Once this has been established, stash the memblock + * ranges off in a secondary structure because memblock is modified + * in memory hotplug while TDX memory regions are fixed. + */ +static int build_tdx_memlist(struct list_head *tmb_list) +{ + unsigned long start_pfn, end_pfn; + int i, nid, ret; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + /* + * The first 1MB is not reported as TDX convertible memory. + * Although the first 1MB is always reserved and won't end up + * to the page allocator, it is still in memblock's memory + * regions. Skip them manually to exclude them as TDX memory. + */ + start_pfn = max(start_pfn, PHYS_PFN(SZ_1M)); + if (start_pfn >= end_pfn) + continue; + + /* + * Add the memory regions as TDX memory. The regions in + * memblock has already guaranteed they are in address + * ascending order and don't overlap. + */ + ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid); + if (ret) + goto err; + } + + return 0; +err: + free_tdx_memlist(tmb_list); + return ret; +} + +static int read_sys_metadata_field(u64 field_id, u64 *data) +{ + struct tdx_module_args args = {}; + int ret; + + /* + * TDH.SYS.RD -- reads one global metadata field + * - RDX (in): the field to read + * - R8 (out): the field data + */ + args.rdx = field_id; + ret = seamcall_prerr_ret(TDH_SYS_RD, &args); + if (ret) + return ret; + + *data = args.r8; + + return 0; +} + +#include "tdx_global_metadata.c" + +static int check_features(struct tdx_sys_info *sysinfo) +{ + u64 tdx_features0 = sysinfo->features.tdx_features0; + + if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) { + pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n"); + return -EINVAL; + } + + return 0; +} + +/* Calculate the actual TDMR size */ +static int tdmr_size_single(u16 max_reserved_per_tdmr) +{ + int tdmr_sz; + + /* + * The actual size of TDMR depends on the maximum + * number of reserved areas. + */ + tdmr_sz = sizeof(struct tdmr_info); + tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr; + + return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT); +} + +static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, + struct tdx_sys_info_tdmr *sysinfo_tdmr) +{ + size_t tdmr_sz, tdmr_array_sz; + void *tdmr_array; + + tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr); + tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs; + + /* + * To keep things simple, allocate all TDMRs together. + * The buffer needs to be physically contiguous to make + * sure each TDMR is physically contiguous. 
+ */ + tdmr_array = alloc_pages_exact(tdmr_array_sz, + GFP_KERNEL | __GFP_ZERO); + if (!tdmr_array) + return -ENOMEM; + + tdmr_list->tdmrs = tdmr_array; + + /* + * Keep the size of TDMR to find the target TDMR + * at a given index in the TDMR list. + */ + tdmr_list->tdmr_sz = tdmr_sz; + tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs; + tdmr_list->nr_consumed_tdmrs = 0; + + return 0; +} + +static void free_tdmr_list(struct tdmr_info_list *tdmr_list) +{ + free_pages_exact(tdmr_list->tdmrs, + tdmr_list->max_tdmrs * tdmr_list->tdmr_sz); +} + +/* Get the TDMR from the list at the given index. */ +static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list, + int idx) +{ + int tdmr_info_offset = tdmr_list->tdmr_sz * idx; + + return (void *)tdmr_list->tdmrs + tdmr_info_offset; +} + +#define TDMR_ALIGNMENT SZ_1G +#define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT) +#define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT) + +static inline u64 tdmr_end(struct tdmr_info *tdmr) +{ + return tdmr->base + tdmr->size; +} + +/* + * Take the memory referenced in @tmb_list and populate the + * preallocated @tdmr_list, following all the special alignment + * and size rules for TDMR. + */ +static int fill_out_tdmrs(struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list) +{ + struct tdx_memblock *tmb; + int tdmr_idx = 0; + + /* + * Loop over TDX memory regions and fill out TDMRs to cover them. + * To keep it simple, always try to use one TDMR to cover one + * memory region. + * + * In practice TDX supports at least 64 TDMRs. A 2-socket system + * typically only consumes less than 10 of those. This code is + * dumb and simple and may use more TMDRs than is strictly + * required. + */ + list_for_each_entry(tmb, tmb_list, list) { + struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx); + u64 start, end; + + start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn)); + end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn)); + + /* + * A valid size indicates the current TDMR has already + * been filled out to cover the previous memory region(s). + */ + if (tdmr->size) { + /* + * Loop to the next if the current memory region + * has already been fully covered. + */ + if (end <= tdmr_end(tdmr)) + continue; + + /* Otherwise, skip the already covered part. */ + if (start < tdmr_end(tdmr)) + start = tdmr_end(tdmr); + + /* + * Create a new TDMR to cover the current memory + * region, or the remaining part of it. + */ + tdmr_idx++; + if (tdmr_idx >= tdmr_list->max_tdmrs) { + pr_warn("initialization failed: TDMRs exhausted.\n"); + return -ENOSPC; + } + + tdmr = tdmr_entry(tdmr_list, tdmr_idx); + } + + tdmr->base = start; + tdmr->size = end - start; + } + + /* @tdmr_idx is always the index of the last valid TDMR. */ + tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1; + + /* + * Warn early that kernel is about to run out of TDMRs. + * + * This is an indication that TDMR allocation has to be + * reworked to be smarter to not run into an issue. + */ + if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN) + pr_warn("consumed TDMRs reaching limit: %d used out of %d\n", + tdmr_list->nr_consumed_tdmrs, + tdmr_list->max_tdmrs); + + return 0; +} + +/* + * Calculate PAMT size given a TDMR and a page size. The returned + * PAMT size is always aligned up to 4K page boundary. 
+ */ +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, + u16 pamt_entry_size) +{ + unsigned long pamt_sz, nr_pamt_entries; + + switch (pgsz) { + case TDX_PS_4K: + nr_pamt_entries = tdmr->size >> PAGE_SHIFT; + break; + case TDX_PS_2M: + nr_pamt_entries = tdmr->size >> PMD_SHIFT; + break; + case TDX_PS_1G: + nr_pamt_entries = tdmr->size >> PUD_SHIFT; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + pamt_sz = nr_pamt_entries * pamt_entry_size; + /* TDX requires PAMT size must be 4K aligned */ + pamt_sz = ALIGN(pamt_sz, PAGE_SIZE); + + return pamt_sz; +} + +/* + * Locate a NUMA node which should hold the allocation of the @tdmr + * PAMT. This node will have some memory covered by the TDMR. The + * relative amount of memory covered is not considered. + */ +static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) +{ + struct tdx_memblock *tmb; + + /* + * A TDMR must cover at least part of one TMB. That TMB will end + * after the TDMR begins. But, that TMB may have started before + * the TDMR. Find the next 'tmb' that _ends_ after this TDMR + * begins. Ignore 'tmb' start addresses. They are irrelevant. + */ + list_for_each_entry(tmb, tmb_list, list) { + if (tmb->end_pfn > PHYS_PFN(tdmr->base)) + return tmb->nid; + } + + /* + * Fall back to allocating the TDMR's metadata from node 0 when + * no TDX memory block can be found. This should never happen + * since TDMRs originate from TDX memory blocks. + */ + pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n", + tdmr->base, tdmr_end(tdmr)); + return 0; +} + +/* + * Allocate PAMTs from the local NUMA node of some memory in @tmb_list + * within @tdmr, and set up PAMTs for @tdmr. + */ +static int tdmr_set_up_pamt(struct tdmr_info *tdmr, + struct list_head *tmb_list, + u16 pamt_entry_size[]) +{ + unsigned long pamt_base[TDX_PS_NR]; + unsigned long pamt_size[TDX_PS_NR]; + unsigned long tdmr_pamt_base; + unsigned long tdmr_pamt_size; + struct page *pamt; + int pgsz, nid; + + nid = tdmr_get_nid(tdmr, tmb_list); + + /* + * Calculate the PAMT size for each TDX supported page size + * and the total PAMT size. + */ + tdmr_pamt_size = 0; + for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { + pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, + pamt_entry_size[pgsz]); + tdmr_pamt_size += pamt_size[pgsz]; + } + + /* + * Allocate one chunk of physically contiguous memory for all + * PAMTs. This helps minimize the PAMT's use of reserved areas + * in overlapped TDMRs. + */ + pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, + nid, &node_online_map); + if (!pamt) + return -ENOMEM; + + /* + * Break the contiguous allocation back up into the + * individual PAMTs for each page size. + */ + tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; + for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { + pamt_base[pgsz] = tdmr_pamt_base; + tdmr_pamt_base += pamt_size[pgsz]; + } + + tdmr->pamt_4k_base = pamt_base[TDX_PS_4K]; + tdmr->pamt_4k_size = pamt_size[TDX_PS_4K]; + tdmr->pamt_2m_base = pamt_base[TDX_PS_2M]; + tdmr->pamt_2m_size = pamt_size[TDX_PS_2M]; + tdmr->pamt_1g_base = pamt_base[TDX_PS_1G]; + tdmr->pamt_1g_size = pamt_size[TDX_PS_1G]; + + return 0; +} + +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, + unsigned long *pamt_size) +{ + unsigned long pamt_bs, pamt_sz; + + /* + * The PAMT was allocated in one contiguous unit. The 4K PAMT + * should always point to the beginning of that allocation. 
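As an illustrative aside (not part of this patch), a worked example of tdmr_get_pamt_sz(): assume a 1GB TDMR and 16-byte PAMT entries at every level (the real entry sizes come from the TDX module's global metadata). The 4K level needs (1GB / 4KB) * 16 bytes = 4MB, the 2M level needs 512 * 16 bytes rounded up to 8KB, and the 1G level needs a single entry rounded up to 4KB, so the TDMR consumes roughly 4MB of PAMT per 1GB covered, consistent with the "~1/256th of system RAM" estimate quoted later in this file.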
+ */ + pamt_bs = tdmr->pamt_4k_base; + pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size; + + WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK)); + + *pamt_base = pamt_bs; + *pamt_size = pamt_sz; +} + +static void tdmr_do_pamt_func(struct tdmr_info *tdmr, + void (*pamt_func)(unsigned long base, unsigned long size)) +{ + unsigned long pamt_base, pamt_size; + + tdmr_get_pamt(tdmr, &pamt_base, &pamt_size); + + /* Do nothing if PAMT hasn't been allocated for this TDMR */ + if (!pamt_size) + return; + + if (WARN_ON_ONCE(!pamt_base)) + return; + + pamt_func(pamt_base, pamt_size); +} + +static void free_pamt(unsigned long pamt_base, unsigned long pamt_size) +{ + free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); +} + +static void tdmr_free_pamt(struct tdmr_info *tdmr) +{ + tdmr_do_pamt_func(tdmr, free_pamt); +} + +static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) +{ + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) + tdmr_free_pamt(tdmr_entry(tdmr_list, i)); +} + +/* Allocate and set up PAMTs for all TDMRs */ +static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, + struct list_head *tmb_list, + u16 pamt_entry_size[]) +{ + int i, ret = 0; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list, + pamt_entry_size); + if (ret) + goto err; + } + + return 0; +err: + tdmrs_free_pamt_all(tdmr_list); + return ret; +} + +/* + * Convert TDX private pages back to normal by using MOVDIR64B to + * clear these pages. Note this function doesn't flush cache of + * these TDX private pages. The caller should make sure of that. + */ +static void reset_tdx_pages(unsigned long base, unsigned long size) +{ + const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); + unsigned long phys, end; + + end = base + size; + for (phys = base; phys < end; phys += 64) + movdir64b(__va(phys), zero_page); + + /* + * MOVDIR64B uses WC protocol. Use memory barrier to + * make sure any later user of these pages sees the + * updated data. + */ + mb(); +} + +static void tdmr_reset_pamt(struct tdmr_info *tdmr) +{ + tdmr_do_pamt_func(tdmr, reset_tdx_pages); +} + +static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list) +{ + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) + tdmr_reset_pamt(tdmr_entry(tdmr_list, i)); +} + +static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) +{ + unsigned long pamt_size = 0; + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + unsigned long base, size; + + tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); + pamt_size += size; + } + + return pamt_size / 1024; +} + +static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, + u64 size, u16 max_reserved_per_tdmr) +{ + struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas; + int idx = *p_idx; + + /* Reserved area must be 4K aligned in offset and size */ + if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK)) + return -EINVAL; + + if (idx >= max_reserved_per_tdmr) { + pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n", + tdmr->base, tdmr_end(tdmr)); + return -ENOSPC; + } + + /* + * Consume one reserved area per call. Make no effort to + * optimize or reduce the number of reserved areas which are + * consumed by contiguous reserved areas, for instance. 
+ */ + rsvd_areas[idx].offset = addr - tdmr->base; + rsvd_areas[idx].size = size; + + *p_idx = idx + 1; + + return 0; +} + +/* + * Go through @tmb_list to find holes between memory areas. If any of + * those holes fall within @tdmr, set up a TDMR reserved area to cover + * the hole. + */ +static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, + struct tdmr_info *tdmr, + int *rsvd_idx, + u16 max_reserved_per_tdmr) +{ + struct tdx_memblock *tmb; + u64 prev_end; + int ret; + + /* + * Start looking for reserved blocks at the + * beginning of the TDMR. + */ + prev_end = tdmr->base; + list_for_each_entry(tmb, tmb_list, list) { + u64 start, end; + + start = PFN_PHYS(tmb->start_pfn); + end = PFN_PHYS(tmb->end_pfn); + + /* Break if this region is after the TDMR */ + if (start >= tdmr_end(tdmr)) + break; + + /* Exclude regions before this TDMR */ + if (end < tdmr->base) + continue; + + /* + * Skip over memory areas that + * have already been dealt with. + */ + if (start <= prev_end) { + prev_end = end; + continue; + } + + /* Add the hole before this region */ + ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, + start - prev_end, + max_reserved_per_tdmr); + if (ret) + return ret; + + prev_end = end; + } + + /* Add the hole after the last region if it exists. */ + if (prev_end < tdmr_end(tdmr)) { + ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, + tdmr_end(tdmr) - prev_end, + max_reserved_per_tdmr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Go through @tdmr_list to find all PAMTs. If any of those PAMTs + * overlaps with @tdmr, set up a TDMR reserved area to cover the + * overlapping part. + */ +static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, + struct tdmr_info *tdmr, + int *rsvd_idx, + u16 max_reserved_per_tdmr) +{ + int i, ret; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + struct tdmr_info *tmp = tdmr_entry(tdmr_list, i); + unsigned long pamt_base, pamt_size, pamt_end; + + tdmr_get_pamt(tmp, &pamt_base, &pamt_size); + /* Each TDMR must already have PAMT allocated */ + WARN_ON_ONCE(!pamt_size || !pamt_base); + + pamt_end = pamt_base + pamt_size; + /* Skip PAMTs outside of the given TDMR */ + if ((pamt_end <= tdmr->base) || + (pamt_base >= tdmr_end(tdmr))) + continue; + + /* Only mark the part within the TDMR as reserved */ + if (pamt_base < tdmr->base) + pamt_base = tdmr->base; + if (pamt_end > tdmr_end(tdmr)) + pamt_end = tdmr_end(tdmr); + + ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base, + pamt_end - pamt_base, + max_reserved_per_tdmr); + if (ret) + return ret; + } + + return 0; +} + +/* Compare function called by sort() for TDMR reserved areas */ +static int rsvd_area_cmp_func(const void *a, const void *b) +{ + struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; + struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; + + if (r1->offset + r1->size <= r2->offset) + return -1; + if (r1->offset >= r2->offset + r2->size) + return 1; + + /* Reserved areas cannot overlap. The caller must guarantee. */ + WARN_ON_ONCE(1); + return -1; +} + +/* + * Populate reserved areas for the given @tdmr, including memory holes + * (via @tmb_list) and PAMTs (via @tdmr_list). 
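As an illustrative aside (not part of this patch), a concrete example of tdmr_populate_rsvd_holes(): for a TDMR covering [0, 1GB) with TDX memory blocks at [1MB, 200MB) and [300MB, 1GB), the walk emits reserved areas for the holes [0, 1MB) and [200MB, 300MB); no trailing hole is added because the last block ends exactly at the TDMR boundary. Any PAMT ranges overlapping the TDMR are then added by tdmr_populate_rsvd_pamts() and the combined list is sorted into ascending order.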
+ */ +static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, + struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list, + u16 max_reserved_per_tdmr) +{ + int ret, rsvd_idx = 0; + + ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx, + max_reserved_per_tdmr); + if (ret) + return ret; + + ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx, + max_reserved_per_tdmr); + if (ret) + return ret; + + /* TDX requires reserved areas listed in address ascending order */ + sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area), + rsvd_area_cmp_func, NULL); + + return 0; +} + +/* + * Populate reserved areas for all TDMRs in @tdmr_list, including memory + * holes (via @tmb_list) and PAMTs. + */ +static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, + struct list_head *tmb_list, + u16 max_reserved_per_tdmr) +{ + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + int ret; + + ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i), + tmb_list, tdmr_list, max_reserved_per_tdmr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Construct a list of TDMRs on the preallocated space in @tdmr_list + * to cover all TDX memory regions in @tmb_list based on the TDX module + * TDMR global information in @sysinfo_tdmr. + */ +static int construct_tdmrs(struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list, + struct tdx_sys_info_tdmr *sysinfo_tdmr) +{ + u16 pamt_entry_size[TDX_PS_NR] = { + sysinfo_tdmr->pamt_4k_entry_size, + sysinfo_tdmr->pamt_2m_entry_size, + sysinfo_tdmr->pamt_1g_entry_size, + }; + int ret; + + ret = fill_out_tdmrs(tmb_list, tdmr_list); + if (ret) + return ret; + + ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size); + if (ret) + return ret; + + ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list, + sysinfo_tdmr->max_reserved_per_tdmr); + if (ret) + tdmrs_free_pamt_all(tdmr_list); + + /* + * The tdmr_info_list is read-only from here on out. + * Ensure that these writes are seen by other CPUs. + * Pairs with a smp_rmb() in is_pamt_page(). + */ + smp_wmb(); + + return ret; +} + +static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid) +{ + struct tdx_module_args args = {}; + u64 *tdmr_pa_array; + size_t array_sz; + int i, ret; + + /* + * TDMRs are passed to the TDX module via an array of physical + * addresses of each TDMR. The array itself also has certain + * alignment requirement. + */ + array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64); + array_sz = roundup_pow_of_two(array_sz); + if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT) + array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT; + + tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL); + if (!tdmr_pa_array) + return -ENOMEM; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) + tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i)); + + args.rcx = __pa(tdmr_pa_array); + args.rdx = tdmr_list->nr_consumed_tdmrs; + args.r8 = global_keyid; + ret = seamcall_prerr(TDH_SYS_CONFIG, &args); + + /* Free the array as it is not required anymore. */ + kfree(tdmr_pa_array); + + return ret; +} + +static int do_global_key_config(void *unused) +{ + struct tdx_module_args args = {}; + + return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args); +} + +/* + * Attempt to configure the global KeyID on all physical packages. + * + * This requires running code on at least one CPU in each package. + * TDMR initialization) will fail will fail if any package in the + * system has no online CPUs. + * + * This code takes no affirmative steps to online CPUs. Callers (aka. 
+ * KVM) can ensure success by ensuring sufficient CPUs are online and + * can run SEAMCALLs. + */ +static int config_global_keyid(void) +{ + cpumask_var_t packages; + int cpu, ret = -EINVAL; + + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) + return -ENOMEM; + + /* + * Hardware doesn't guarantee cache coherency across different + * KeyIDs. The kernel needs to flush PAMT's dirty cachelines + * (associated with KeyID 0) before the TDX module can use the + * global KeyID to access the PAMT. Given PAMTs are potentially + * large (~1/256th of system RAM), just use WBINVD. + */ + wbinvd_on_all_cpus(); + + for_each_online_cpu(cpu) { + /* + * The key configuration only needs to be done once per + * package and will return an error if configured more + * than once. Avoid doing it multiple times per package. + */ + if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu), + packages)) + continue; + + /* + * TDH.SYS.KEY.CONFIG cannot run concurrently on + * different cpus. Do it one by one. + */ + ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true); + if (ret) + break; + } + + free_cpumask_var(packages); + return ret; +} + +static int init_tdmr(struct tdmr_info *tdmr) +{ + u64 next; + + /* + * Initializing a TDMR can be time consuming. To avoid long + * SEAMCALLs, the TDX module may only initialize a part of the + * TDMR in each call. + */ + do { + struct tdx_module_args args = { + .rcx = tdmr->base, + }; + int ret; + + ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args); + if (ret) + return ret; + /* + * RDX contains 'next-to-initialize' address if + * TDH.SYS.TDMR.INIT did not fully complete and + * should be retried. + */ + next = args.rdx; + cond_resched(); + /* Keep making SEAMCALLs until the TDMR is done */ + } while (next < tdmr->base + tdmr->size); + + return 0; +} + +static int init_tdmrs(struct tdmr_info_list *tdmr_list) +{ + int i; + + /* + * This operation is costly. It can be parallelized, + * but keep it simple for now. + */ + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + int ret; + + ret = init_tdmr(tdmr_entry(tdmr_list, i)); + if (ret) + return ret; + } + + return 0; +} + +static int init_tdx_module(void) +{ + struct tdx_sys_info sysinfo; + int ret; + + ret = get_tdx_sys_info(&sysinfo); + if (ret) + return ret; + + /* Check whether the kernel can support this module */ + ret = check_features(&sysinfo); + if (ret) + return ret; + + /* + * To keep things simple, assume that all TDX-protected memory + * will come from the page allocator. Make sure all pages in the + * page allocator are TDX-usable memory. + * + * Build the list of "TDX-usable" memory regions which cover all + * pages in the page allocator to guarantee that. Do it while + * holding mem_hotplug_lock read-lock as the memory hotplug code + * path reads the @tdx_memlist to reject any new memory. 
+ */
+	get_online_mems();
+
+	ret = build_tdx_memlist(&tdx_memlist);
+	if (ret)
+		goto out_put_tdxmem;
+
+	/* Allocate enough space for constructing TDMRs */
+	ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr);
+	if (ret)
+		goto err_free_tdxmem;
+
+	/* Cover all TDX-usable memory regions in TDMRs */
+	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr);
+	if (ret)
+		goto err_free_tdmrs;
+
+	/* Pass the TDMRs and the global KeyID to the TDX module */
+	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
+	if (ret)
+		goto err_free_pamts;
+
+	/* Configure the global KeyID on all packages */
+	ret = config_global_keyid();
+	if (ret)
+		goto err_reset_pamts;
+
+	/* Initialize TDMRs to complete the TDX module initialization */
+	ret = init_tdmrs(&tdx_tdmr_list);
+	if (ret)
+		goto err_reset_pamts;
+
+	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
+
+out_put_tdxmem:
+	/*
+	 * @tdx_memlist is written above and read at memory hotplug time.
+	 * Memory hotplug was locked out while building it; release the
+	 * lock now that the list is final.
+	 */
+	put_online_mems();
+	return ret;
+
+err_reset_pamts:
+	/*
+	 * Parts of the PAMTs may already have been initialized by the
+	 * TDX module.  Flush the cache before returning the PAMTs back
+	 * to the kernel.
+	 */
+	wbinvd_on_all_cpus();
+	/*
+	 * According to the TDX hardware spec, if the platform
+	 * doesn't have the "partial write machine check"
+	 * erratum, any kernel read/write will never cause #MC
+	 * in kernel space, thus it's OK to not convert PAMTs
+	 * back to normal.  But do the conversion anyway here
+	 * as suggested by the TDX spec.
+	 */
+	tdmrs_reset_pamt_all(&tdx_tdmr_list);
+err_free_pamts:
+	tdmrs_free_pamt_all(&tdx_tdmr_list);
+err_free_tdmrs:
+	free_tdmr_list(&tdx_tdmr_list);
+err_free_tdxmem:
+	free_tdx_memlist(&tdx_memlist);
+	goto out_put_tdxmem;
+}
+
+static int __tdx_enable(void)
+{
+	int ret;
+
+	ret = init_tdx_module();
+	if (ret) {
+		pr_err("module initialization failed (%d)\n", ret);
+		tdx_module_status = TDX_MODULE_ERROR;
+		return ret;
+	}
+
+	pr_info("module initialized\n");
+	tdx_module_status = TDX_MODULE_INITIALIZED;
+
+	return 0;
+}
+
+/**
+ * tdx_enable - Enable TDX module to make it ready to run TDX guests
+ *
+ * This function assumes the caller has: 1) taken the CPU hotplug read
+ * lock to prevent any new CPU from becoming online; 2) done both VMXON
+ * and tdx_cpu_enable() on all online CPUs.
+ *
+ * To succeed, this function requires at least one online CPU in each
+ * CPU package.
+ *
+ * This function can be called in parallel by multiple callers.
+ *
+ * Return 0 if TDX is enabled successfully, otherwise error.
+ */
+int tdx_enable(void)
+{
+	int ret;
+
+	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+		return -ENODEV;
+
+	lockdep_assert_cpus_held();
+
+	mutex_lock(&tdx_module_lock);
+
+	switch (tdx_module_status) {
+	case TDX_MODULE_UNINITIALIZED:
+		ret = __tdx_enable();
+		break;
+	case TDX_MODULE_INITIALIZED:
+		/* Already initialized, great, tell the caller. */
+		ret = 0;
+		break;
+	default:
+		/* Failed to initialize in the previous attempts */
+		ret = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&tdx_module_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tdx_enable);
+
+static bool is_pamt_page(unsigned long phys)
+{
+	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
+	int i;
+
+	/* Ensure that all remote 'tdmr_list' writes are visible: */
+	smp_rmb();
+
+	/*
+	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
+	 * is initialized.  The 'tdmr_list' was initialized long ago
+	 * and is now read-only.
+ */
+	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+		unsigned long base, size;
+
+		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
+
+		if (phys >= base && phys < (base + size))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return whether the memory page at the given physical address is TDX
+ * private memory or not.
+ *
+ * This can be imprecise for two known reasons:
+ * 1. PAMTs are private memory and exist before the TDX module is
+ *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
+ *    short window that occurs once per boot.
+ * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
+ *    page.  However, the page can still cause #MC until it has been
+ *    fully converted to shared using 64-byte writes like MOVDIR64B.
+ *    Buggy hosts might still leave #MC-causing memory in place which
+ *    this function cannot detect.
+ */
+static bool paddr_is_tdx_private(unsigned long phys)
+{
+	struct tdx_module_args args = {
+		.rcx = phys & PAGE_MASK,
+	};
+	u64 sret;
+
+	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+		return false;
+
+	/* Get page type from the TDX module */
+	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+
+	/*
+	 * The SEAMCALL will not return success unless there is a
+	 * working, "ready" TDX module.  Assume an absence of TDX
+	 * private pages until SEAMCALL is working.
+	 */
+	if (sret)
+		return false;
+
+	/*
+	 * SEAMCALL was successful -- read page type (via RCX):
+	 *
+	 *  - PT_NDA:	Page is not used by the TDX module
+	 *  - PT_RSVD:	Reserved for Non-TDX use
+	 *  - Others:	Page is used by the TDX module
+	 *
+	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
+	 * private memory.
+	 */
+	switch (args.rcx) {
+	case PT_NDA:
+		return false;
+	case PT_RSVD:
+		return is_pamt_page(phys);
+	default:
+		return true;
+	}
+}
+
+/*
+ * Some TDX-capable CPUs have an erratum.  A write to TDX private
+ * memory poisons that memory, and a subsequent read of that memory
+ * triggers #MC.
+ *
+ * Help distinguish an erratum-triggered #MC from a normal hardware
+ * one by printing an additional message noting that the #MC may be a
+ * result of the erratum.
+ */
+const char *tdx_dump_mce_info(struct mce *m)
+{
+	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
+		return NULL;
+
+	if (!paddr_is_tdx_private(m->addr))
+		return NULL;
+
+	return "TDX private memory error. Possible kernel bug.";
+}
+
+static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
+					    u32 *nr_tdx_keyids)
+{
+	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
+	int ret;
+
+	/*
+	 * IA32_MKTME_KEYID_PARTITIONING:
+	 *   Bit [31:0]:  Number of MKTME KeyIDs.
+	 *   Bit [63:32]: Number of TDX private KeyIDs.
+	 */
+	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
+			 &_nr_tdx_keyids);
+	if (ret || !_nr_tdx_keyids)
+		return -EINVAL;
+
+	/* TDX KeyIDs start after the last MKTME KeyID. */
+	_tdx_keyid_start = _nr_mktme_keyids + 1;
+
+	*tdx_keyid_start = _tdx_keyid_start;
+	*nr_tdx_keyids = _nr_tdx_keyids;
+
+	return 0;
+}
+
+static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct tdx_memblock *tmb;
+
+	/*
+	 * This check assumes that the start_pfn<->end_pfn range does not
+	 * cross multiple @tdx_memlist entries.  A single memory online
+	 * event across multiple memblocks (from which @tdx_memlist
+	 * entries are derived at the time of module initialization) is
+	 * not possible.
+ * This is because memory offline/online is done at the granularity
+ * of 'struct memory_block', and a hotpluggable memory region (one
+ * memblock) is always a multiple of a memory_block.
+ */
+	list_for_each_entry(tmb, &tdx_memlist, list) {
+		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
+			return true;
+	}
+	return false;
+}
+
+static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
+			       void *v)
+{
+	struct memory_notify *mn = v;
+
+	if (action != MEM_GOING_ONLINE)
+		return NOTIFY_OK;
+
+	/*
+	 * An empty list means TDX isn't enabled.  Allow any memory
+	 * to go online.
+	 */
+	if (list_empty(&tdx_memlist))
+		return NOTIFY_OK;
+
+	/*
+	 * The TDX memory configuration is static and cannot be
+	 * changed.  Reject onlining any memory which is outside of
+	 * the static configuration, whether it supports TDX or not.
+	 */
+	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
+		return NOTIFY_OK;
+
+	return NOTIFY_BAD;
+}
+
+static struct notifier_block tdx_memory_nb = {
+	.notifier_call = tdx_memory_notifier,
+};
+
+static void __init check_tdx_erratum(void)
+{
+	/*
+	 * These CPUs have an erratum.  A partial write from non-TD
+	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
+	 * private memory poisons that memory, and a subsequent read of
+	 * that memory triggers #MC.
+	 */
+	switch (boot_cpu_data.x86_vfm) {
+	case INTEL_SAPPHIRERAPIDS_X:
+	case INTEL_EMERALDRAPIDS_X:
+		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
+	}
+}
+
+void __init tdx_init(void)
+{
+	u32 tdx_keyid_start, nr_tdx_keyids;
+	int err;
+
+	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
+	if (err)
+		return;
+
+	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
+		tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
+
+	/*
+	 * The TDX module itself requires one 'global KeyID' to protect
+	 * its metadata.  If there's only one TDX KeyID, there won't be
+	 * any left for TDX guests, so there's no point in enabling TDX
+	 * at all.
+	 */
+	if (nr_tdx_keyids < 2) {
+		pr_err("initialization failed: too few private KeyIDs available.\n");
+		return;
+	}
+
+	/*
+	 * At this point, hibernation_available() indicates whether or
+	 * not hibernation support has been permanently disabled.
+	 */
+	if (hibernation_available()) {
+		pr_err("initialization failed: Hibernation support is enabled\n");
+		return;
+	}
+
+	err = register_memory_notifier(&tdx_memory_nb);
+	if (err) {
+		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
+		       err);
+		return;
+	}
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
+	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
+	acpi_suspend_lowlevel = NULL;
+#endif
+
+	/*
+	 * Just use the first TDX KeyID as the 'global KeyID' and
+	 * leave the rest for TDX guests.
+	 */
+	tdx_global_keyid = tdx_keyid_start;
+	tdx_guest_keyid_start = tdx_keyid_start + 1;
+	tdx_nr_guest_keyids = nr_tdx_keyids - 1;
+
+	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
+
+	check_tdx_erratum();
+}
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
new file mode 100644
index 000000000000..4e3d533cdd61
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_VIRT_TDX_H
+#define _X86_VIRT_TDX_H
+
+#include <linux/bits.h>
+#include "tdx_global_metadata.h"
+
+/*
+ * This file contains both macros and data structures defined by the TDX
+ * architecture and Linux-defined software data structures and functions.
+ * For readability, the two are not mixed together: the architectural
+ * definitions come first.
+ */
+
+/*
+ * TDX module SEAMCALL leaf functions
+ */
+#define TDH_PHYMEM_PAGE_RDMD	24
+#define TDH_SYS_KEY_CONFIG	31
+#define TDH_SYS_INIT		33
+#define TDH_SYS_RD		34
+#define TDH_SYS_LP_INIT		35
+#define TDH_SYS_TDMR_INIT	36
+#define TDH_SYS_CONFIG		45
+
+/* TDX page types */
+#define PT_NDA		0x0
+#define PT_RSVD		0x1
+
+struct tdmr_reserved_area {
+	u64 offset;
+	u64 size;
+} __packed;
+
+#define TDMR_INFO_ALIGNMENT		512
+#define TDMR_INFO_PA_ARRAY_ALIGNMENT	512
+
+struct tdmr_info {
+	u64 base;
+	u64 size;
+	u64 pamt_1g_base;
+	u64 pamt_1g_size;
+	u64 pamt_2m_base;
+	u64 pamt_2m_size;
+	u64 pamt_4k_base;
+	u64 pamt_4k_size;
+	/*
+	 * The actual number of reserved areas depends on the value of
+	 * field MD_FIELD_ID_MAX_RESERVED_PER_TDMR in the TDX module
+	 * global metadata.
+	 */
+	DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas);
+} __packed __aligned(TDMR_INFO_ALIGNMENT);
+
+/* Bit definitions of TDX_FEATURES0 metadata field */
+#define TDX_FEATURES0_NO_RBP_MOD	BIT(18)
+
+/*
+ * Do not put any hardware-defined TDX structure representations below
+ * this comment!
+ */
+
+/* Kernel-defined TDX module status during module initialization. */
+enum tdx_module_status_t {
+	TDX_MODULE_UNINITIALIZED,
+	TDX_MODULE_INITIALIZED,
+	TDX_MODULE_ERROR
+};
+
+struct tdx_memblock {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	int nid;
+};
+
+/* Warn if the kernel has fewer than TDMR_NR_WARN TDMRs after allocation */
+#define TDMR_NR_WARN		4
+
+struct tdmr_info_list {
+	void *tdmrs;		/* Flexible array to hold 'tdmr_info's */
+	int nr_consumed_tdmrs;	/* How many 'tdmr_info's are in use */
+
+	/* Metadata for finding target 'tdmr_info' and freeing @tdmrs */
+	int tdmr_sz;		/* Size of one 'tdmr_info' */
+	int max_tdmrs;		/* How many 'tdmr_info's are allocated */
+};
+
+#endif
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
new file mode 100644
index 000000000000..8027a24d1c6e
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Automatically generated functions to read TDX global metadata.
+ *
+ * This file doesn't compile on its own as it lacks the SEAMCALL
+ * wrapper primitive that reads global metadata.  Include this file
+ * from another C file instead.
+ */ + +static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x0A00000300000008, &val))) + sysinfo_features->tdx_features0 = val; + + return ret; +} + +static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000008, &val))) + sysinfo_tdmr->max_tdmrs = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000009, &val))) + sysinfo_tdmr->max_reserved_per_tdmr = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000010, &val))) + sysinfo_tdmr->pamt_4k_entry_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000011, &val))) + sysinfo_tdmr->pamt_2m_entry_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000012, &val))) + sysinfo_tdmr->pamt_1g_entry_size = val; + + return ret; +} + +static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) +{ + int ret = 0; + + ret = ret ?: get_tdx_sys_info_features(&sysinfo->features); + ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr); + + return ret; +} diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h new file mode 100644 index 000000000000..6dd3c9695f59 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Automatically generated TDX global metadata structures. */ +#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H +#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H + +#include <linux/types.h> + +struct tdx_sys_info_features { + u64 tdx_features0; +}; + +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; +}; + +struct tdx_sys_info { + struct tdx_sys_info_features features; + struct tdx_sys_info_tdmr tdmr; +}; + +#endif |
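As the header comment in tdx_global_metadata.c notes, the generated readers only build when textually included from a C file that supplies read_sys_metadata_field(). Below is a minimal sketch of such a consumer, not part of the patch; it reuses seamcall_prerr_ret() and TDH_SYS_RD from the code above, while the TDH.SYS.RD register convention (field ID in RDX, value returned in R8) is an assumption and the real wrapper in tdx.c may differ in detail.

/*
 * Sketch of a consumer (assumed, not part of the patch): provide the
 * metadata-read primitive, then pull in the generated readers via
 * textual inclusion.
 */
static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/* Assumed TDH.SYS.RD convention: field ID in RDX, value in R8 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

#include "tdx_global_metadata.c"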