summaryrefslogtreecommitdiff
path: root/arch/x86/virt
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/virt')
-rw-r--r--arch/x86/virt/Makefile2
-rw-r--r--arch/x86/virt/svm/Makefile4
-rw-r--r--arch/x86/virt/svm/cmdline.c45
-rw-r--r--arch/x86/virt/svm/sev.c1072
-rw-r--r--arch/x86/virt/vmx/tdx/Makefile2
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c1458
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.h89
-rw-r--r--arch/x86/virt/vmx/tdx/tdx_global_metadata.c48
-rw-r--r--arch/x86/virt/vmx/tdx/tdx_global_metadata.h25
9 files changed, 2743 insertions, 2 deletions
diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile
index 1e36502cd738..ea343fc392dc 100644
--- a/arch/x86/virt/Makefile
+++ b/arch/x86/virt/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += vmx/
+obj-y += svm/ vmx/
diff --git a/arch/x86/virt/svm/Makefile b/arch/x86/virt/svm/Makefile
new file mode 100644
index 000000000000..eca6d71355fa
--- /dev/null
+++ b/arch/x86/virt/svm/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_KVM_AMD_SEV) += sev.o
+obj-$(CONFIG_CPU_SUP_AMD) += cmdline.o
diff --git a/arch/x86/virt/svm/cmdline.c b/arch/x86/virt/svm/cmdline.c
new file mode 100644
index 000000000000..affa2759fa20
--- /dev/null
+++ b/arch/x86/virt/svm/cmdline.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM-SEV command line parsing support
+ *
+ * Copyright (C) 2023 - 2024 Advanced Micro Devices, Inc.
+ *
+ * Author: Michael Roth <michael.roth@amd.com>
+ */
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/cache.h>
+#include <linux/cpufeature.h>
+
+#include <asm/sev-common.h>
+
+struct sev_config sev_cfg __read_mostly;
+
+static int __init init_sev_config(char *str)
+{
+ char *s;
+
+ while ((s = strsep(&str, ","))) {
+ if (!strcmp(s, "debug")) {
+ sev_cfg.debug = true;
+ continue;
+ }
+
+ if (!strcmp(s, "nosnp")) {
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ continue;
+ } else {
+ goto warn;
+ }
+ }
+
+warn:
+ pr_info("SEV command-line option '%s' was not recognized\n", s);
+ }
+
+ return 1;
+}
+__setup("sev=", init_sev_config);
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
new file mode 100644
index 000000000000..42e74a5a7d78
--- /dev/null
+++ b/arch/x86/virt/svm/sev.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM-SEV Host Support.
+ *
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra <ashish.kalra@amd.com>
+ *
+ */
+
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/iommu.h>
+#include <linux/amd-iommu.h>
+#include <linux/nospec.h>
+
+#include <asm/sev.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid.h>
+#include <asm/cmdline.h>
+#include <asm/iommu.h>
+
+/*
+ * The RMP entry information as returned by the RMPREAD instruction.
+ */
+struct rmpentry {
+ u64 gpa;
+ u8 assigned :1,
+ rsvd1 :7;
+ u8 pagesize :1,
+ hpage_region_status :1,
+ rsvd2 :6;
+ u8 immutable :1,
+ rsvd3 :7;
+ u8 rsvd4;
+ u32 asid;
+} __packed;
+
+/*
+ * The raw RMP entry format is not architectural. The format is defined in PPR
+ * Family 19h Model 01h, Rev B1 processor. This format represents the actual
+ * entry in the RMP table memory. The bitfield definitions are used for machines
+ * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
+ * fields are only used for dumping the raw data.
+ */
+struct rmpentry_raw {
+ union {
+ struct {
+ u64 assigned : 1,
+ pagesize : 1,
+ immutable : 1,
+ rsvd1 : 9,
+ gpa : 39,
+ asid : 10,
+ vmsa : 1,
+ validated : 1,
+ rsvd2 : 1;
+ };
+ u64 lo;
+ };
+ u64 hi;
+} __packed;
+
+/*
+ * The first 16KB from the RMP_BASE is used by the processor for the
+ * bookkeeping, the range needs to be added during the RMP entry lookup.
+ */
+#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000
+
+/*
+ * For a non-segmented RMP table, use the maximum physical addressing as the
+ * segment size in order to always arrive at index 0 in the table.
+ */
+#define RMPTABLE_NON_SEGMENTED_SHIFT 52
+
+struct rmp_segment_desc {
+ struct rmpentry_raw *rmp_entry;
+ u64 max_index;
+ u64 size;
+};
+
+/*
+ * Segmented RMP Table support.
+ * - The segment size is used for two purposes:
+ * - Identify the amount of memory covered by an RMP segment
+ * - Quickly locate an RMP segment table entry for a physical address
+ *
+ * - The RMP segment table contains pointers to an RMP table that covers
+ * a specific portion of memory. There can be up to 512 8-byte entries,
+ * one pages worth.
+ */
+#define RST_ENTRY_MAPPED_SIZE(x) ((x) & GENMASK_ULL(19, 0))
+#define RST_ENTRY_SEGMENT_BASE(x) ((x) & GENMASK_ULL(51, 20))
+
+#define RST_SIZE SZ_4K
+static struct rmp_segment_desc **rmp_segment_table __ro_after_init;
+static unsigned int rst_max_index __ro_after_init = 512;
+
+static unsigned int rmp_segment_shift;
+static u64 rmp_segment_size;
+static u64 rmp_segment_mask;
+
+#define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift)
+#define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask)))
+
+static u64 rmp_cfg;
+
+/* Mask to apply to a PFN to get the first PFN of a 2MB page */
+#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
+
+static u64 probed_rmp_base, probed_rmp_size;
+
+static LIST_HEAD(snp_leaked_pages_list);
+static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
+
+static unsigned long snp_nr_leaked_pages;
+
+#undef pr_fmt
+#define pr_fmt(fmt) "SEV-SNP: " fmt
+
+static int __mfd_enable(unsigned int cpu)
+{
+ u64 val;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return 0;
+
+ rdmsrl(MSR_AMD64_SYSCFG, val);
+
+ val |= MSR_AMD64_SYSCFG_MFDM;
+
+ wrmsrl(MSR_AMD64_SYSCFG, val);
+
+ return 0;
+}
+
+static __init void mfd_enable(void *arg)
+{
+ __mfd_enable(smp_processor_id());
+}
+
+static int __snp_enable(unsigned int cpu)
+{
+ u64 val;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return 0;
+
+ rdmsrl(MSR_AMD64_SYSCFG, val);
+
+ val |= MSR_AMD64_SYSCFG_SNP_EN;
+ val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
+
+ wrmsrl(MSR_AMD64_SYSCFG, val);
+
+ return 0;
+}
+
+static __init void snp_enable(void *arg)
+{
+ __snp_enable(smp_processor_id());
+}
+
+static void __init __snp_fixup_e820_tables(u64 pa)
+{
+ if (IS_ALIGNED(pa, PMD_SIZE))
+ return;
+
+ /*
+ * Handle cases where the RMP table placement by the BIOS is not
+ * 2M aligned and the kexec kernel could try to allocate
+ * from within that chunk which then causes a fatal RMP fault.
+ *
+ * The e820_table needs to be updated as it is converted to
+ * kernel memory resources and used by KEXEC_FILE_LOAD syscall
+ * to load kexec segments.
+ *
+ * The e820_table_firmware needs to be updated as it is exposed
+ * to sysfs and used by the KEXEC_LOAD syscall to load kexec
+ * segments.
+ *
+ * The e820_table_kexec needs to be updated as it passed to
+ * the kexec-ed kernel.
+ */
+ pa = ALIGN_DOWN(pa, PMD_SIZE);
+ if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
+ pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
+ e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ if (!memblock_is_region_reserved(pa, PMD_SIZE))
+ memblock_reserve(pa, PMD_SIZE);
+ }
+}
+
+static void __init fixup_e820_tables_for_segmented_rmp(void)
+{
+ u64 pa, *rst, size, mapped_size;
+ unsigned int i;
+
+ __snp_fixup_e820_tables(probed_rmp_base);
+
+ pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
+
+ __snp_fixup_e820_tables(pa + RST_SIZE);
+
+ rst = early_memremap(pa, RST_SIZE);
+ if (!rst)
+ return;
+
+ for (i = 0; i < rst_max_index; i++) {
+ pa = RST_ENTRY_SEGMENT_BASE(rst[i]);
+ mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
+ if (!mapped_size)
+ continue;
+
+ __snp_fixup_e820_tables(pa);
+
+ /*
+ * Mapped size in GB. Mapped size is allowed to exceed
+ * the segment coverage size, but gets reduced to the
+ * segment coverage size.
+ */
+ mapped_size <<= 30;
+ if (mapped_size > rmp_segment_size)
+ mapped_size = rmp_segment_size;
+
+ /* Calculate the RMP segment size (16 bytes/page mapped) */
+ size = PHYS_PFN(mapped_size) << 4;
+
+ __snp_fixup_e820_tables(pa + size);
+ }
+
+ early_memunmap(rst, RST_SIZE);
+}
+
+static void __init fixup_e820_tables_for_contiguous_rmp(void)
+{
+ __snp_fixup_e820_tables(probed_rmp_base);
+ __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
+}
+
+void __init snp_fixup_e820_tables(void)
+{
+ if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
+ fixup_e820_tables_for_segmented_rmp();
+ } else {
+ fixup_e820_tables_for_contiguous_rmp();
+ }
+}
+
+static bool __init clear_rmptable_bookkeeping(void)
+{
+ void *bk;
+
+ bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB);
+ if (!bk) {
+ pr_err("Failed to map RMP bookkeeping area\n");
+ return false;
+ }
+
+ memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ);
+
+ memunmap(bk);
+
+ return true;
+}
+
+static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa)
+{
+ u64 rst_index, rmp_segment_size_max;
+ struct rmp_segment_desc *desc;
+ void *rmp_segment;
+
+ /* Calculate the maximum size an RMP can be (16 bytes/page mapped) */
+ rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4;
+
+ /* Validate the RMP segment size */
+ if (segment_size > rmp_segment_size_max) {
+ pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n",
+ segment_size, rmp_segment_size_max);
+ return false;
+ }
+
+ /* Validate the RMP segment table index */
+ rst_index = RST_ENTRY_INDEX(pa);
+ if (rst_index >= rst_max_index) {
+ pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n",
+ pa, rmp_segment_size);
+ return false;
+ }
+
+ if (rmp_segment_table[rst_index]) {
+ pr_err("RMP segment descriptor already exists at index %llu\n", rst_index);
+ return false;
+ }
+
+ rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB);
+ if (!rmp_segment) {
+ pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n",
+ segment_pa, segment_size);
+ return false;
+ }
+
+ desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ if (!desc) {
+ memunmap(rmp_segment);
+ return false;
+ }
+
+ desc->rmp_entry = rmp_segment;
+ desc->max_index = segment_size / sizeof(*desc->rmp_entry);
+ desc->size = segment_size;
+
+ rmp_segment_table[rst_index] = desc;
+
+ return true;
+}
+
+static void __init free_rmp_segment_table(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < rst_max_index; i++) {
+ struct rmp_segment_desc *desc;
+
+ desc = rmp_segment_table[i];
+ if (!desc)
+ continue;
+
+ memunmap(desc->rmp_entry);
+
+ kfree(desc);
+ }
+
+ free_page((unsigned long)rmp_segment_table);
+
+ rmp_segment_table = NULL;
+}
+
+/* Allocate the table used to index into the RMP segments */
+static bool __init alloc_rmp_segment_table(void)
+{
+ struct page *page;
+
+ page = alloc_page(__GFP_ZERO);
+ if (!page)
+ return false;
+
+ rmp_segment_table = page_address(page);
+
+ return true;
+}
+
+static bool __init setup_contiguous_rmptable(void)
+{
+ u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end;
+
+ if (!probed_rmp_size)
+ return false;
+
+ rmp_end = probed_rmp_base + probed_rmp_size - 1;
+
+ /*
+ * Calculate the amount of memory that must be reserved by the BIOS to
+ * address the whole RAM, including the bookkeeping area. The RMP itself
+ * must also be covered.
+ */
+ max_rmp_pfn = max_pfn;
+ if (PFN_UP(rmp_end) > max_pfn)
+ max_rmp_pfn = PFN_UP(rmp_end);
+
+ calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
+ if (calc_rmp_sz > probed_rmp_size) {
+ pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
+ calc_rmp_sz, probed_rmp_size);
+ return false;
+ }
+
+ if (!alloc_rmp_segment_table())
+ return false;
+
+ /* Map only the RMP entries */
+ rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
+ rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
+
+ if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) {
+ free_rmp_segment_table();
+ return false;
+ }
+
+ return true;
+}
+
+static bool __init setup_segmented_rmptable(void)
+{
+ u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max;
+ unsigned int i, max_index;
+
+ if (!probed_rmp_base)
+ return false;
+
+ if (!alloc_rmp_segment_table())
+ return false;
+
+ rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
+ rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB);
+ if (!rst) {
+ pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa);
+ goto e_free;
+ }
+
+ pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30);
+
+ ram_pa_max = max_pfn << PAGE_SHIFT;
+
+ max_index = 0;
+ ram_pa_end = 0;
+ for (i = 0; i < rst_max_index; i++) {
+ u64 rmp_segment, rmp_size, mapped_size;
+
+ mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
+ if (!mapped_size)
+ continue;
+
+ max_index = i;
+
+ /*
+ * Mapped size in GB. Mapped size is allowed to exceed the
+ * segment coverage size, but gets reduced to the segment
+ * coverage size.
+ */
+ mapped_size <<= 30;
+ if (mapped_size > rmp_segment_size) {
+ pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n",
+ i, mapped_size, rmp_segment_size);
+ mapped_size = rmp_segment_size;
+ }
+
+ rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]);
+
+ /* Calculate the RMP segment size (16 bytes/page mapped) */
+ rmp_size = PHYS_PFN(mapped_size) << 4;
+
+ pa = (u64)i << rmp_segment_shift;
+
+ /*
+ * Some segments may be for MMIO mapped above system RAM. These
+ * segments are used for Trusted I/O.
+ */
+ if (pa < ram_pa_max)
+ ram_pa_end = pa + mapped_size;
+
+ if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa))
+ goto e_unmap;
+
+ pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n",
+ i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1);
+ }
+
+ if (ram_pa_max > ram_pa_end) {
+ pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
+ ram_pa_max, ram_pa_end);
+ goto e_unmap;
+ }
+
+ /* Adjust the maximum index based on the found segments */
+ rst_max_index = max_index + 1;
+
+ memunmap(rst);
+
+ return true;
+
+e_unmap:
+ memunmap(rst);
+
+e_free:
+ free_rmp_segment_table();
+
+ return false;
+}
+
+static bool __init setup_rmptable(void)
+{
+ if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
+ return setup_segmented_rmptable();
+ } else {
+ return setup_contiguous_rmptable();
+ }
+}
+
+/*
+ * Do the necessary preparations which are verified by the firmware as
+ * described in the SNP_INIT_EX firmware command description in the SNP
+ * firmware ABI spec.
+ */
+int __init snp_rmptable_init(void)
+{
+ unsigned int i;
+ u64 val;
+
+ if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP)))
+ return -ENOSYS;
+
+ if (WARN_ON_ONCE(!amd_iommu_snp_en))
+ return -ENOSYS;
+
+ if (!setup_rmptable())
+ return -ENOSYS;
+
+ /*
+ * Check if SEV-SNP is already enabled, this can happen in case of
+ * kexec boot.
+ */
+ rdmsrl(MSR_AMD64_SYSCFG, val);
+ if (val & MSR_AMD64_SYSCFG_SNP_EN)
+ goto skip_enable;
+
+ /* Zero out the RMP bookkeeping area */
+ if (!clear_rmptable_bookkeeping()) {
+ free_rmp_segment_table();
+ return -ENOSYS;
+ }
+
+ /* Zero out the RMP entries */
+ for (i = 0; i < rst_max_index; i++) {
+ struct rmp_segment_desc *desc;
+
+ desc = rmp_segment_table[i];
+ if (!desc)
+ continue;
+
+ memset(desc->rmp_entry, 0, desc->size);
+ }
+
+ /* Flush the caches to ensure that data is written before SNP is enabled. */
+ wbinvd_on_all_cpus();
+
+ /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
+ on_each_cpu(mfd_enable, NULL, 1);
+
+ on_each_cpu(snp_enable, NULL, 1);
+
+skip_enable:
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
+
+ /*
+ * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
+ * notifier is invoked to do SNP IOMMU shutdown before kdump.
+ */
+ crash_kexec_post_notifiers = true;
+
+ return 0;
+}
+
+static void set_rmp_segment_info(unsigned int segment_shift)
+{
+ rmp_segment_shift = segment_shift;
+ rmp_segment_size = 1ULL << rmp_segment_shift;
+ rmp_segment_mask = rmp_segment_size - 1;
+}
+
+#define RMP_ADDR_MASK GENMASK_ULL(51, 13)
+
+static bool probe_contiguous_rmptable_info(void)
+{
+ u64 rmp_sz, rmp_base, rmp_end;
+
+ rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
+ rdmsrl(MSR_AMD64_RMP_END, rmp_end);
+
+ if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
+ pr_err("Memory for the RMP table has not been reserved by BIOS\n");
+ return false;
+ }
+
+ if (rmp_base > rmp_end) {
+ pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
+ return false;
+ }
+
+ rmp_sz = rmp_end - rmp_base + 1;
+
+ /* Treat the contiguous RMP table as a single segment */
+ rst_max_index = 1;
+
+ set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT);
+
+ probed_rmp_base = rmp_base;
+ probed_rmp_size = rmp_sz;
+
+ pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
+ rmp_base, rmp_end);
+
+ return true;
+}
+
+static bool probe_segmented_rmptable_info(void)
+{
+ unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max;
+ u64 rmp_base, rmp_end;
+
+ rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
+ if (!(rmp_base & RMP_ADDR_MASK)) {
+ pr_err("Memory for the RMP table has not been reserved by BIOS\n");
+ return false;
+ }
+
+ rdmsrl(MSR_AMD64_RMP_END, rmp_end);
+ WARN_ONCE(rmp_end & RMP_ADDR_MASK,
+ "Segmented RMP enabled but RMP_END MSR is non-zero\n");
+
+ /* Obtain the min and max supported RMP segment size */
+ eax = cpuid_eax(0x80000025);
+ segment_shift_min = eax & GENMASK(5, 0);
+ segment_shift_max = (eax & GENMASK(11, 6)) >> 6;
+
+ /* Verify the segment size is within the supported limits */
+ segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg);
+ if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) {
+ pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n",
+ segment_shift, segment_shift_min, segment_shift_max);
+ return false;
+ }
+
+ /* Override the max supported RST index if a hardware limit exists */
+ ebx = cpuid_ebx(0x80000025);
+ if (ebx & BIT(10))
+ rst_max_index = ebx & GENMASK(9, 0);
+
+ set_rmp_segment_info(segment_shift);
+
+ probed_rmp_base = rmp_base;
+ probed_rmp_size = 0;
+
+ pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n",
+ rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE);
+
+ return true;
+}
+
+bool snp_probe_rmptable_info(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP))
+ rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg);
+
+ if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
+ return probe_segmented_rmptable_info();
+ else
+ return probe_contiguous_rmptable_info();
+}
+
+/*
+ * About the array_index_nospec() usage below:
+ *
+ * This function can get called by exported functions like
+ * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among
+ * others, and since the @pfn passed in cannot always be trusted,
+ * speculation should be stopped as a protective measure.
+ */
+static struct rmpentry_raw *get_raw_rmpentry(u64 pfn)
+{
+ u64 paddr, rst_index, segment_index;
+ struct rmp_segment_desc *desc;
+
+ if (!rmp_segment_table)
+ return ERR_PTR(-ENODEV);
+
+ paddr = pfn << PAGE_SHIFT;
+
+ rst_index = RST_ENTRY_INDEX(paddr);
+ if (unlikely(rst_index >= rst_max_index))
+ return ERR_PTR(-EFAULT);
+
+ rst_index = array_index_nospec(rst_index, rst_max_index);
+
+ desc = rmp_segment_table[rst_index];
+ if (unlikely(!desc))
+ return ERR_PTR(-EFAULT);
+
+ segment_index = RMP_ENTRY_INDEX(paddr);
+ if (unlikely(segment_index >= desc->max_index))
+ return ERR_PTR(-EFAULT);
+
+ segment_index = array_index_nospec(segment_index, desc->max_index);
+
+ return desc->rmp_entry + segment_index;
+}
+
+static int get_rmpentry(u64 pfn, struct rmpentry *e)
+{
+ struct rmpentry_raw *e_raw;
+
+ if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) {
+ int ret;
+
+ /* Binutils version 2.44 supports the RMPREAD mnemonic. */
+ asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd"
+ : "=a" (ret)
+ : "a" (pfn << PAGE_SHIFT), "c" (e)
+ : "memory", "cc");
+
+ return ret;
+ }
+
+ e_raw = get_raw_rmpentry(pfn);
+ if (IS_ERR(e_raw))
+ return PTR_ERR(e_raw);
+
+ /*
+ * Map the raw RMP table entry onto the RMPREAD output format.
+ * The 2MB region status indicator (hpage_region_status field) is not
+ * calculated, since the overhead could be significant and the field
+ * is not used.
+ */
+ memset(e, 0, sizeof(*e));
+ e->gpa = e_raw->gpa << PAGE_SHIFT;
+ e->asid = e_raw->asid;
+ e->assigned = e_raw->assigned;
+ e->pagesize = e_raw->pagesize;
+ e->immutable = e_raw->immutable;
+
+ return 0;
+}
+
+static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level)
+{
+ struct rmpentry e_large;
+ int ret;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return -ENODEV;
+
+ ret = get_rmpentry(pfn, e);
+ if (ret)
+ return ret;
+
+ /*
+ * Find the authoritative RMP entry for a PFN. This can be either a 4K
+ * RMP entry or a special large RMP entry that is authoritative for a
+ * whole 2M area.
+ */
+ ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large);
+ if (ret)
+ return ret;
+
+ *level = RMP_TO_PG_LEVEL(e_large.pagesize);
+
+ return 0;
+}
+
+int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
+{
+ struct rmpentry e;
+ int ret;
+
+ ret = __snp_lookup_rmpentry(pfn, &e, level);
+ if (ret)
+ return ret;
+
+ *assigned = !!e.assigned;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
+
+/*
+ * Dump the raw RMP entry for a particular PFN. These bits are documented in the
+ * PPR for a particular CPU model and provide useful information about how a
+ * particular PFN is being utilized by the kernel/firmware at the time certain
+ * unexpected events occur, such as RMP faults.
+ */
+static void dump_rmpentry(u64 pfn)
+{
+ struct rmpentry_raw *e_raw;
+ u64 pfn_i, pfn_end;
+ struct rmpentry e;
+ int level, ret;
+
+ ret = __snp_lookup_rmpentry(pfn, &e, &level);
+ if (ret) {
+ pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n",
+ pfn, ret);
+ return;
+ }
+
+ if (e.assigned) {
+ e_raw = get_raw_rmpentry(pfn);
+ if (IS_ERR(e_raw)) {
+ pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n",
+ pfn, PTR_ERR(e_raw));
+ return;
+ }
+
+ pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
+ pfn, e_raw->lo, e_raw->hi);
+ return;
+ }
+
+ /*
+ * If the RMP entry for a particular PFN is not in an assigned state,
+ * then it is sometimes useful to get an idea of whether or not any RMP
+ * entries for other PFNs within the same 2MB region are assigned, since
+ * those too can affect the ability to access a particular PFN in
+ * certain situations, such as when the PFN is being accessed via a 2MB
+ * mapping in the host page table.
+ */
+ pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ pfn_end = pfn_i + PTRS_PER_PMD;
+
+ pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
+ pfn, pfn_i, pfn_end);
+
+ while (pfn_i < pfn_end) {
+ e_raw = get_raw_rmpentry(pfn_i);
+ if (IS_ERR(e_raw)) {
+ pr_err("Error %ld reading RMP contents for PFN 0x%llx\n",
+ PTR_ERR(e_raw), pfn_i);
+ pfn_i++;
+ continue;
+ }
+
+ if (e_raw->lo || e_raw->hi)
+ pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi);
+ pfn_i++;
+ }
+}
+
+void snp_dump_hva_rmpentry(unsigned long hva)
+{
+ unsigned long paddr;
+ unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd += pgd_index(hva);
+ pte = lookup_address_in_pgd(pgd, hva, &level);
+
+ if (!pte) {
+ pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
+ return;
+ }
+
+ paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
+ dump_rmpentry(PHYS_PFN(paddr));
+}
+
+/*
+ * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
+ * Validated bit.
+ */
+int psmash(u64 pfn)
+{
+ unsigned long paddr = pfn << PAGE_SHIFT;
+ int ret;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return -ENODEV;
+
+ if (!pfn_valid(pfn))
+ return -EINVAL;
+
+ /* Binutils version 2.36 supports the PSMASH mnemonic. */
+ asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
+ : "=a" (ret)
+ : "a" (paddr)
+ : "memory", "cc");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(psmash);
+
+/*
+ * If the kernel uses a 2MB or larger directmap mapping to write to an address,
+ * and that mapping contains any 4KB pages that are set to private in the RMP
+ * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
+ * owns the PFNs being transitioned will never attempt such a write, but other
+ * kernel tasks writing to other PFNs in the range may trigger these checks
+ * inadvertently due a large directmap mapping that happens to overlap such a
+ * PFN.
+ *
+ * Prevent this by splitting any 2MB+ mappings that might end up containing a
+ * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
+ * PFN/rmp_level passed in.
+ *
+ * Note that there is no attempt here to scan all the RMP entries for the 2MB
+ * physical range, since it would only be worthwhile in determining if a
+ * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
+ * the same shared/private state, thus avoiding the need to split the mapping.
+ * But that would mean the entries are currently in a mixed state, and so the
+ * mapping would have already been split as a result of prior transitions.
+ * And since the 4K split is only done if the mapping is 2MB+, and there isn't
+ * currently a mechanism in place to restore 2MB+ mappings, such a check would
+ * not provide any usable benefit.
+ *
+ * More specifics on how these checks are carried out can be found in APM
+ * Volume 2, "RMP and VMPL Access Checks".
+ */
+static int adjust_direct_map(u64 pfn, int rmp_level)
+{
+ unsigned long vaddr;
+ unsigned int level;
+ int npages, ret;
+ pte_t *pte;
+
+ /*
+ * pfn_to_kaddr() will return a vaddr only within the direct
+ * map range.
+ */
+ vaddr = (unsigned long)pfn_to_kaddr(pfn);
+
+ /* Only 4KB/2MB RMP entries are supported by current hardware. */
+ if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
+ return -EINVAL;
+
+ if (!pfn_valid(pfn))
+ return -EINVAL;
+
+ if (rmp_level == PG_LEVEL_2M &&
+ (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
+ return -EINVAL;
+
+ /*
+ * If an entire 2MB physical range is being transitioned, then there is
+ * no risk of RMP #PFs due to write accesses from overlapping mappings,
+ * since even accesses from 1GB mappings will be treated as 2MB accesses
+ * as far as RMP table checks are concerned.
+ */
+ if (rmp_level == PG_LEVEL_2M)
+ return 0;
+
+ pte = lookup_address(vaddr, &level);
+ if (!pte || pte_none(*pte))
+ return 0;
+
+ if (level == PG_LEVEL_4K)
+ return 0;
+
+ npages = page_level_size(rmp_level) / PAGE_SIZE;
+ ret = set_memory_4k(vaddr, npages);
+ if (ret)
+ pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
+ pfn, ret);
+
+ return ret;
+}
+
+/*
+ * It is expected that those operations are seldom enough so that no mutual
+ * exclusion of updaters is needed and thus the overlap error condition below
+ * should happen very rarely and would get resolved relatively quickly by
+ * the firmware.
+ *
+ * If not, one could consider introducing a mutex or so here to sync concurrent
+ * RMP updates and thus diminish the amount of cases where firmware needs to
+ * lock 2M ranges to protect against concurrent updates.
+ *
+ * The optimal solution would be range locking to avoid locking disjoint
+ * regions unnecessarily but there's no support for that yet.
+ */
+static int rmpupdate(u64 pfn, struct rmp_state *state)
+{
+ unsigned long paddr = pfn << PAGE_SHIFT;
+ int ret, level;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return -ENODEV;
+
+ level = RMP_TO_PG_LEVEL(state->pagesize);
+
+ if (adjust_direct_map(pfn, level))
+ return -EFAULT;
+
+ do {
+ /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
+ asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
+ : "=a" (ret)
+ : "a" (paddr), "c" ((unsigned long)state)
+ : "memory", "cc");
+ } while (ret == RMPUPDATE_FAIL_OVERLAP);
+
+ if (ret) {
+ pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
+ pfn, level, ret);
+ dump_rmpentry(pfn);
+ dump_stack();
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/* Transition a page to guest-owned/private state in the RMP table. */
+int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
+{
+ struct rmp_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.assigned = 1;
+ state.asid = asid;
+ state.immutable = immutable;
+ state.gpa = gpa;
+ state.pagesize = PG_LEVEL_TO_RMP(level);
+
+ return rmpupdate(pfn, &state);
+}
+EXPORT_SYMBOL_GPL(rmp_make_private);
+
+/* Transition a page to hypervisor-owned/shared state in the RMP table. */
+int rmp_make_shared(u64 pfn, enum pg_level level)
+{
+ struct rmp_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.pagesize = PG_LEVEL_TO_RMP(level);
+
+ return rmpupdate(pfn, &state);
+}
+EXPORT_SYMBOL_GPL(rmp_make_shared);
+
+void snp_leak_pages(u64 pfn, unsigned int npages)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
+
+ spin_lock(&snp_leaked_pages_list_lock);
+ while (npages--) {
+
+ /*
+ * Reuse the page's buddy list for chaining into the leaked
+ * pages list. This page should not be on a free list currently
+ * and is also unsafe to be added to a free list.
+ */
+ if (likely(!PageCompound(page)) ||
+
+ /*
+ * Skip inserting tail pages of compound page as
+ * page->buddy_list of tail pages is not usable.
+ */
+ (PageHead(page) && compound_nr(page) <= npages))
+ list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
+
+ dump_rmpentry(pfn);
+ snp_nr_leaked_pages++;
+ pfn++;
+ page++;
+ }
+ spin_unlock(&snp_leaked_pages_list_lock);
+}
+EXPORT_SYMBOL_GPL(snp_leak_pages);
+
+void kdump_sev_callback(void)
+{
+ /*
+ * Do wbinvd() on remote CPUs when SNP is enabled in order to
+ * safely do SNP_SHUTDOWN on the local CPU.
+ */
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ wbinvd();
+}
diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile
index 46ef8f73aebb..90da47eb85ee 100644
--- a/arch/x86/virt/vmx/tdx/Makefile
+++ b/arch/x86/virt/vmx/tdx/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += seamcall.o
+obj-y += seamcall.o tdx.o
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
new file mode 100644
index 000000000000..7fdb37387886
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -0,0 +1,1458 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright(c) 2023 Intel Corporation.
+ *
+ * Intel Trusted Domain Extensions (TDX) support
+ */
+
+#define pr_fmt(fmt) "virt/tdx: " fmt
+
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/cpu.h>
+#include <linux/spinlock.h>
+#include <linux/percpu-defs.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/pfn.h>
+#include <linux/align.h>
+#include <linux/sort.h>
+#include <linux/log2.h>
+#include <linux/acpi.h>
+#include <linux/suspend.h>
+#include <asm/page.h>
+#include <asm/special_insns.h>
+#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/cpufeature.h>
+#include <asm/tdx.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/mce.h>
+#include "tdx.h"
+
+static u32 tdx_global_keyid __ro_after_init;
+static u32 tdx_guest_keyid_start __ro_after_init;
+static u32 tdx_nr_guest_keyids __ro_after_init;
+
+static DEFINE_PER_CPU(bool, tdx_lp_initialized);
+
+static struct tdmr_info_list tdx_tdmr_list;
+
+static enum tdx_module_status_t tdx_module_status;
+static DEFINE_MUTEX(tdx_module_lock);
+
+/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
+static LIST_HEAD(tdx_memlist);
+
+typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
+
+static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
+{
+ pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
+}
+
+static inline void seamcall_err_ret(u64 fn, u64 err,
+ struct tdx_module_args *args)
+{
+ seamcall_err(fn, err, args);
+ pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
+ args->rcx, args->rdx, args->r8);
+ pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
+ args->r9, args->r10, args->r11);
+}
+
+static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
+ u64 fn, struct tdx_module_args *args)
+{
+ u64 sret = sc_retry(func, fn, args);
+
+ if (sret == TDX_SUCCESS)
+ return 0;
+
+ if (sret == TDX_SEAMCALL_VMFAILINVALID)
+ return -ENODEV;
+
+ if (sret == TDX_SEAMCALL_GP)
+ return -EOPNOTSUPP;
+
+ if (sret == TDX_SEAMCALL_UD)
+ return -EACCES;
+
+ err_func(fn, sret, args);
+ return -EIO;
+}
+
+#define seamcall_prerr(__fn, __args) \
+ sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
+
+#define seamcall_prerr_ret(__fn, __args) \
+ sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
+
+/*
+ * Do the module global initialization once and return its result.
+ * It can be done on any cpu. It's always called with interrupts
+ * disabled.
+ */
+static int try_init_module_global(void)
+{
+ struct tdx_module_args args = {};
+ static DEFINE_RAW_SPINLOCK(sysinit_lock);
+ static bool sysinit_done;
+ static int sysinit_ret;
+
+ lockdep_assert_irqs_disabled();
+
+ raw_spin_lock(&sysinit_lock);
+
+ if (sysinit_done)
+ goto out;
+
+ /* RCX is module attributes and all bits are reserved */
+ args.rcx = 0;
+ sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+
+ /*
+ * The first SEAMCALL also detects the TDX module, thus
+ * it can fail due to the TDX module is not loaded.
+ * Dump message to let the user know.
+ */
+ if (sysinit_ret == -ENODEV)
+ pr_err("module not loaded\n");
+
+ sysinit_done = true;
+out:
+ raw_spin_unlock(&sysinit_lock);
+ return sysinit_ret;
+}
+
+/**
+ * tdx_cpu_enable - Enable TDX on local cpu
+ *
+ * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
+ * global initialization SEAMCALL if not done) on local cpu to make this
+ * cpu be ready to run any other SEAMCALLs.
+ *
+ * Always call this function via IPI function calls.
+ *
+ * Return 0 on success, otherwise errors.
+ */
+int tdx_cpu_enable(void)
+{
+ struct tdx_module_args args = {};
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+ return -ENODEV;
+
+ lockdep_assert_irqs_disabled();
+
+ if (__this_cpu_read(tdx_lp_initialized))
+ return 0;
+
+ /*
+ * The TDX module global initialization is the very first step
+ * to enable TDX. Need to do it first (if hasn't been done)
+ * before the per-cpu initialization.
+ */
+ ret = try_init_module_global();
+ if (ret)
+ return ret;
+
+ ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
+ if (ret)
+ return ret;
+
+ __this_cpu_write(tdx_lp_initialized, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_cpu_enable);
+
+/*
+ * Add a memory region as a TDX memory block. The caller must make sure
+ * all memory regions are added in address ascending order and don't
+ * overlap.
+ */
+static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
+ unsigned long end_pfn, int nid)
+{
+ struct tdx_memblock *tmb;
+
+ tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
+ if (!tmb)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&tmb->list);
+ tmb->start_pfn = start_pfn;
+ tmb->end_pfn = end_pfn;
+ tmb->nid = nid;
+
+ /* @tmb_list is protected by mem_hotplug_lock */
+ list_add_tail(&tmb->list, tmb_list);
+ return 0;
+}
+
+static void free_tdx_memlist(struct list_head *tmb_list)
+{
+ /* @tmb_list is protected by mem_hotplug_lock */
+ while (!list_empty(tmb_list)) {
+ struct tdx_memblock *tmb = list_first_entry(tmb_list,
+ struct tdx_memblock, list);
+
+ list_del(&tmb->list);
+ kfree(tmb);
+ }
+}
+
+/*
+ * Ensure that all memblock memory regions are convertible to TDX
+ * memory. Once this has been established, stash the memblock
+ * ranges off in a secondary structure because memblock is modified
+ * in memory hotplug while TDX memory regions are fixed.
+ */
+static int build_tdx_memlist(struct list_head *tmb_list)
+{
+ unsigned long start_pfn, end_pfn;
+ int i, nid, ret;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ /*
+ * The first 1MB is not reported as TDX convertible memory.
+ * Although the first 1MB is always reserved and won't end up
+ * to the page allocator, it is still in memblock's memory
+ * regions. Skip them manually to exclude them as TDX memory.
+ */
+ start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /*
+ * Add the memory regions as TDX memory. The regions in
+ * memblock has already guaranteed they are in address
+ * ascending order and don't overlap.
+ */
+ ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ free_tdx_memlist(tmb_list);
+ return ret;
+}
+
+static int read_sys_metadata_field(u64 field_id, u64 *data)
+{
+ struct tdx_module_args args = {};
+ int ret;
+
+ /*
+ * TDH.SYS.RD -- reads one global metadata field
+ * - RDX (in): the field to read
+ * - R8 (out): the field data
+ */
+ args.rdx = field_id;
+ ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
+ if (ret)
+ return ret;
+
+ *data = args.r8;
+
+ return 0;
+}
+
+#include "tdx_global_metadata.c"
+
+static int check_features(struct tdx_sys_info *sysinfo)
+{
+ u64 tdx_features0 = sysinfo->features.tdx_features0;
+
+ if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
+ pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Calculate the actual TDMR size */
+static int tdmr_size_single(u16 max_reserved_per_tdmr)
+{
+ int tdmr_sz;
+
+ /*
+ * The actual size of TDMR depends on the maximum
+ * number of reserved areas.
+ */
+ tdmr_sz = sizeof(struct tdmr_info);
+ tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
+
+ return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
+}
+
+static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
+ struct tdx_sys_info_tdmr *sysinfo_tdmr)
+{
+ size_t tdmr_sz, tdmr_array_sz;
+ void *tdmr_array;
+
+ tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
+ tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
+
+ /*
+ * To keep things simple, allocate all TDMRs together.
+ * The buffer needs to be physically contiguous to make
+ * sure each TDMR is physically contiguous.
+ */
+ tdmr_array = alloc_pages_exact(tdmr_array_sz,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!tdmr_array)
+ return -ENOMEM;
+
+ tdmr_list->tdmrs = tdmr_array;
+
+ /*
+ * Keep the size of TDMR to find the target TDMR
+ * at a given index in the TDMR list.
+ */
+ tdmr_list->tdmr_sz = tdmr_sz;
+ tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
+ tdmr_list->nr_consumed_tdmrs = 0;
+
+ return 0;
+}
+
+static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
+{
+ free_pages_exact(tdmr_list->tdmrs,
+ tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
+}
+
+/* Get the TDMR from the list at the given index. */
+static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
+ int idx)
+{
+ int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
+
+ return (void *)tdmr_list->tdmrs + tdmr_info_offset;
+}
+
+#define TDMR_ALIGNMENT SZ_1G
+#define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
+#define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT)
+
+static inline u64 tdmr_end(struct tdmr_info *tdmr)
+{
+ return tdmr->base + tdmr->size;
+}
+
+/*
+ * Take the memory referenced in @tmb_list and populate the
+ * preallocated @tdmr_list, following all the special alignment
+ * and size rules for TDMR.
+ */
+static int fill_out_tdmrs(struct list_head *tmb_list,
+ struct tdmr_info_list *tdmr_list)
+{
+ struct tdx_memblock *tmb;
+ int tdmr_idx = 0;
+
+ /*
+ * Loop over TDX memory regions and fill out TDMRs to cover them.
+ * To keep it simple, always try to use one TDMR to cover one
+ * memory region.
+ *
+ * In practice TDX supports at least 64 TDMRs. A 2-socket system
+ * typically only consumes less than 10 of those. This code is
+ * dumb and simple and may use more TMDRs than is strictly
+ * required.
+ */
+ list_for_each_entry(tmb, tmb_list, list) {
+ struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
+ u64 start, end;
+
+ start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
+ end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
+
+ /*
+ * A valid size indicates the current TDMR has already
+ * been filled out to cover the previous memory region(s).
+ */
+ if (tdmr->size) {
+ /*
+ * Loop to the next if the current memory region
+ * has already been fully covered.
+ */
+ if (end <= tdmr_end(tdmr))
+ continue;
+
+ /* Otherwise, skip the already covered part. */
+ if (start < tdmr_end(tdmr))
+ start = tdmr_end(tdmr);
+
+ /*
+ * Create a new TDMR to cover the current memory
+ * region, or the remaining part of it.
+ */
+ tdmr_idx++;
+ if (tdmr_idx >= tdmr_list->max_tdmrs) {
+ pr_warn("initialization failed: TDMRs exhausted.\n");
+ return -ENOSPC;
+ }
+
+ tdmr = tdmr_entry(tdmr_list, tdmr_idx);
+ }
+
+ tdmr->base = start;
+ tdmr->size = end - start;
+ }
+
+ /* @tdmr_idx is always the index of the last valid TDMR. */
+ tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
+
+ /*
+ * Warn early that kernel is about to run out of TDMRs.
+ *
+ * This is an indication that TDMR allocation has to be
+ * reworked to be smarter to not run into an issue.
+ */
+ if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
+ pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
+ tdmr_list->nr_consumed_tdmrs,
+ tdmr_list->max_tdmrs);
+
+ return 0;
+}
+
+/*
+ * Calculate PAMT size given a TDMR and a page size. The returned
+ * PAMT size is always aligned up to 4K page boundary.
+ */
+static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
+ u16 pamt_entry_size)
+{
+ unsigned long pamt_sz, nr_pamt_entries;
+
+ switch (pgsz) {
+ case TDX_PS_4K:
+ nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
+ break;
+ case TDX_PS_2M:
+ nr_pamt_entries = tdmr->size >> PMD_SHIFT;
+ break;
+ case TDX_PS_1G:
+ nr_pamt_entries = tdmr->size >> PUD_SHIFT;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ pamt_sz = nr_pamt_entries * pamt_entry_size;
+ /* TDX requires PAMT size must be 4K aligned */
+ pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
+
+ return pamt_sz;
+}
+
+/*
+ * Locate a NUMA node which should hold the allocation of the @tdmr
+ * PAMT. This node will have some memory covered by the TDMR. The
+ * relative amount of memory covered is not considered.
+ */
+static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
+{
+ struct tdx_memblock *tmb;
+
+ /*
+ * A TDMR must cover at least part of one TMB. That TMB will end
+ * after the TDMR begins. But, that TMB may have started before
+ * the TDMR. Find the next 'tmb' that _ends_ after this TDMR
+ * begins. Ignore 'tmb' start addresses. They are irrelevant.
+ */
+ list_for_each_entry(tmb, tmb_list, list) {
+ if (tmb->end_pfn > PHYS_PFN(tdmr->base))
+ return tmb->nid;
+ }
+
+ /*
+ * Fall back to allocating the TDMR's metadata from node 0 when
+ * no TDX memory block can be found. This should never happen
+ * since TDMRs originate from TDX memory blocks.
+ */
+ pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
+ tdmr->base, tdmr_end(tdmr));
+ return 0;
+}
+
+/*
+ * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
+ * within @tdmr, and set up PAMTs for @tdmr.
+ */
+static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
+ struct list_head *tmb_list,
+ u16 pamt_entry_size[])
+{
+ unsigned long pamt_base[TDX_PS_NR];
+ unsigned long pamt_size[TDX_PS_NR];
+ unsigned long tdmr_pamt_base;
+ unsigned long tdmr_pamt_size;
+ struct page *pamt;
+ int pgsz, nid;
+
+ nid = tdmr_get_nid(tdmr, tmb_list);
+
+ /*
+ * Calculate the PAMT size for each TDX supported page size
+ * and the total PAMT size.
+ */
+ tdmr_pamt_size = 0;
+ for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
+ pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
+ pamt_entry_size[pgsz]);
+ tdmr_pamt_size += pamt_size[pgsz];
+ }
+
+ /*
+ * Allocate one chunk of physically contiguous memory for all
+ * PAMTs. This helps minimize the PAMT's use of reserved areas
+ * in overlapped TDMRs.
+ */
+ pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
+ nid, &node_online_map);
+ if (!pamt)
+ return -ENOMEM;
+
+ /*
+ * Break the contiguous allocation back up into the
+ * individual PAMTs for each page size.
+ */
+ tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
+ for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
+ pamt_base[pgsz] = tdmr_pamt_base;
+ tdmr_pamt_base += pamt_size[pgsz];
+ }
+
+ tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
+ tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
+ tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
+ tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
+ tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
+ tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
+
+ return 0;
+}
+
+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
+ unsigned long *pamt_size)
+{
+ unsigned long pamt_bs, pamt_sz;
+
+ /*
+ * The PAMT was allocated in one contiguous unit. The 4K PAMT
+ * should always point to the beginning of that allocation.
+ */
+ pamt_bs = tdmr->pamt_4k_base;
+ pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+ WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
+
+ *pamt_base = pamt_bs;
+ *pamt_size = pamt_sz;
+}
+
+static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
+ void (*pamt_func)(unsigned long base, unsigned long size))
+{
+ unsigned long pamt_base, pamt_size;
+
+ tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
+
+ /* Do nothing if PAMT hasn't been allocated for this TDMR */
+ if (!pamt_size)
+ return;
+
+ if (WARN_ON_ONCE(!pamt_base))
+ return;
+
+ pamt_func(pamt_base, pamt_size);
+}
+
+static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
+{
+ free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
+}
+
+static void tdmr_free_pamt(struct tdmr_info *tdmr)
+{
+ tdmr_do_pamt_func(tdmr, free_pamt);
+}
+
+static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
+{
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+ tdmr_free_pamt(tdmr_entry(tdmr_list, i));
+}
+
+/* Allocate and set up PAMTs for all TDMRs */
+static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
+ struct list_head *tmb_list,
+ u16 pamt_entry_size[])
+{
+ int i, ret = 0;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
+ pamt_entry_size);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ tdmrs_free_pamt_all(tdmr_list);
+ return ret;
+}
+
+/*
+ * Convert TDX private pages back to normal by using MOVDIR64B to
+ * clear these pages. Note this function doesn't flush cache of
+ * these TDX private pages. The caller should make sure of that.
+ */
+static void reset_tdx_pages(unsigned long base, unsigned long size)
+{
+ const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
+ unsigned long phys, end;
+
+ end = base + size;
+ for (phys = base; phys < end; phys += 64)
+ movdir64b(__va(phys), zero_page);
+
+ /*
+ * MOVDIR64B uses WC protocol. Use memory barrier to
+ * make sure any later user of these pages sees the
+ * updated data.
+ */
+ mb();
+}
+
+static void tdmr_reset_pamt(struct tdmr_info *tdmr)
+{
+ tdmr_do_pamt_func(tdmr, reset_tdx_pages);
+}
+
+static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
+{
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+ tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
+}
+
+static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
+{
+ unsigned long pamt_size = 0;
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ unsigned long base, size;
+
+ tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
+ pamt_size += size;
+ }
+
+ return pamt_size / 1024;
+}
+
+static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
+ u64 size, u16 max_reserved_per_tdmr)
+{
+ struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
+ int idx = *p_idx;
+
+ /* Reserved area must be 4K aligned in offset and size */
+ if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
+ return -EINVAL;
+
+ if (idx >= max_reserved_per_tdmr) {
+ pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
+ tdmr->base, tdmr_end(tdmr));
+ return -ENOSPC;
+ }
+
+ /*
+ * Consume one reserved area per call. Make no effort to
+ * optimize or reduce the number of reserved areas which are
+ * consumed by contiguous reserved areas, for instance.
+ */
+ rsvd_areas[idx].offset = addr - tdmr->base;
+ rsvd_areas[idx].size = size;
+
+ *p_idx = idx + 1;
+
+ return 0;
+}
+
+/*
+ * Go through @tmb_list to find holes between memory areas. If any of
+ * those holes fall within @tdmr, set up a TDMR reserved area to cover
+ * the hole.
+ */
+static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
+ struct tdmr_info *tdmr,
+ int *rsvd_idx,
+ u16 max_reserved_per_tdmr)
+{
+ struct tdx_memblock *tmb;
+ u64 prev_end;
+ int ret;
+
+ /*
+ * Start looking for reserved blocks at the
+ * beginning of the TDMR.
+ */
+ prev_end = tdmr->base;
+ list_for_each_entry(tmb, tmb_list, list) {
+ u64 start, end;
+
+ start = PFN_PHYS(tmb->start_pfn);
+ end = PFN_PHYS(tmb->end_pfn);
+
+ /* Break if this region is after the TDMR */
+ if (start >= tdmr_end(tdmr))
+ break;
+
+ /* Exclude regions before this TDMR */
+ if (end < tdmr->base)
+ continue;
+
+ /*
+ * Skip over memory areas that
+ * have already been dealt with.
+ */
+ if (start <= prev_end) {
+ prev_end = end;
+ continue;
+ }
+
+ /* Add the hole before this region */
+ ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
+ start - prev_end,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+
+ prev_end = end;
+ }
+
+ /* Add the hole after the last region if it exists. */
+ if (prev_end < tdmr_end(tdmr)) {
+ ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
+ tdmr_end(tdmr) - prev_end,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Go through @tdmr_list to find all PAMTs. If any of those PAMTs
+ * overlaps with @tdmr, set up a TDMR reserved area to cover the
+ * overlapping part.
+ */
+static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
+ struct tdmr_info *tdmr,
+ int *rsvd_idx,
+ u16 max_reserved_per_tdmr)
+{
+ int i, ret;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
+ unsigned long pamt_base, pamt_size, pamt_end;
+
+ tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
+ /* Each TDMR must already have PAMT allocated */
+ WARN_ON_ONCE(!pamt_size || !pamt_base);
+
+ pamt_end = pamt_base + pamt_size;
+ /* Skip PAMTs outside of the given TDMR */
+ if ((pamt_end <= tdmr->base) ||
+ (pamt_base >= tdmr_end(tdmr)))
+ continue;
+
+ /* Only mark the part within the TDMR as reserved */
+ if (pamt_base < tdmr->base)
+ pamt_base = tdmr->base;
+ if (pamt_end > tdmr_end(tdmr))
+ pamt_end = tdmr_end(tdmr);
+
+ ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
+ pamt_end - pamt_base,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Compare function called by sort() for TDMR reserved areas */
+static int rsvd_area_cmp_func(const void *a, const void *b)
+{
+ struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
+ struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
+
+ if (r1->offset + r1->size <= r2->offset)
+ return -1;
+ if (r1->offset >= r2->offset + r2->size)
+ return 1;
+
+ /* Reserved areas cannot overlap. The caller must guarantee. */
+ WARN_ON_ONCE(1);
+ return -1;
+}
+
+/*
+ * Populate reserved areas for the given @tdmr, including memory holes
+ * (via @tmb_list) and PAMTs (via @tdmr_list).
+ */
+static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
+ struct list_head *tmb_list,
+ struct tdmr_info_list *tdmr_list,
+ u16 max_reserved_per_tdmr)
+{
+ int ret, rsvd_idx = 0;
+
+ ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+
+ ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+
+ /* TDX requires reserved areas listed in address ascending order */
+ sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
+ rsvd_area_cmp_func, NULL);
+
+ return 0;
+}
+
+/*
+ * Populate reserved areas for all TDMRs in @tdmr_list, including memory
+ * holes (via @tmb_list) and PAMTs.
+ */
+static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
+ struct list_head *tmb_list,
+ u16 max_reserved_per_tdmr)
+{
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ int ret;
+
+ ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
+ tmb_list, tdmr_list, max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Construct a list of TDMRs on the preallocated space in @tdmr_list
+ * to cover all TDX memory regions in @tmb_list based on the TDX module
+ * TDMR global information in @sysinfo_tdmr.
+ */
+static int construct_tdmrs(struct list_head *tmb_list,
+ struct tdmr_info_list *tdmr_list,
+ struct tdx_sys_info_tdmr *sysinfo_tdmr)
+{
+ u16 pamt_entry_size[TDX_PS_NR] = {
+ sysinfo_tdmr->pamt_4k_entry_size,
+ sysinfo_tdmr->pamt_2m_entry_size,
+ sysinfo_tdmr->pamt_1g_entry_size,
+ };
+ int ret;
+
+ ret = fill_out_tdmrs(tmb_list, tdmr_list);
+ if (ret)
+ return ret;
+
+ ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
+ if (ret)
+ return ret;
+
+ ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
+ sysinfo_tdmr->max_reserved_per_tdmr);
+ if (ret)
+ tdmrs_free_pamt_all(tdmr_list);
+
+ /*
+ * The tdmr_info_list is read-only from here on out.
+ * Ensure that these writes are seen by other CPUs.
+ * Pairs with a smp_rmb() in is_pamt_page().
+ */
+ smp_wmb();
+
+ return ret;
+}
+
+static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
+{
+ struct tdx_module_args args = {};
+ u64 *tdmr_pa_array;
+ size_t array_sz;
+ int i, ret;
+
+ /*
+ * TDMRs are passed to the TDX module via an array of physical
+ * addresses of each TDMR. The array itself also has certain
+ * alignment requirement.
+ */
+ array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
+ array_sz = roundup_pow_of_two(array_sz);
+ if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
+ array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
+
+ tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
+ if (!tdmr_pa_array)
+ return -ENOMEM;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+ tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
+
+ args.rcx = __pa(tdmr_pa_array);
+ args.rdx = tdmr_list->nr_consumed_tdmrs;
+ args.r8 = global_keyid;
+ ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
+
+ /* Free the array as it is not required anymore. */
+ kfree(tdmr_pa_array);
+
+ return ret;
+}
+
+static int do_global_key_config(void *unused)
+{
+ struct tdx_module_args args = {};
+
+ return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
+}
+
+/*
+ * Attempt to configure the global KeyID on all physical packages.
+ *
+ * This requires running code on at least one CPU in each package.
+ * TDMR initialization) will fail will fail if any package in the
+ * system has no online CPUs.
+ *
+ * This code takes no affirmative steps to online CPUs. Callers (aka.
+ * KVM) can ensure success by ensuring sufficient CPUs are online and
+ * can run SEAMCALLs.
+ */
+static int config_global_keyid(void)
+{
+ cpumask_var_t packages;
+ int cpu, ret = -EINVAL;
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * Hardware doesn't guarantee cache coherency across different
+ * KeyIDs. The kernel needs to flush PAMT's dirty cachelines
+ * (associated with KeyID 0) before the TDX module can use the
+ * global KeyID to access the PAMT. Given PAMTs are potentially
+ * large (~1/256th of system RAM), just use WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ for_each_online_cpu(cpu) {
+ /*
+ * The key configuration only needs to be done once per
+ * package and will return an error if configured more
+ * than once. Avoid doing it multiple times per package.
+ */
+ if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
+ packages))
+ continue;
+
+ /*
+ * TDH.SYS.KEY.CONFIG cannot run concurrently on
+ * different cpus. Do it one by one.
+ */
+ ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
+ if (ret)
+ break;
+ }
+
+ free_cpumask_var(packages);
+ return ret;
+}
+
+static int init_tdmr(struct tdmr_info *tdmr)
+{
+ u64 next;
+
+ /*
+ * Initializing a TDMR can be time consuming. To avoid long
+ * SEAMCALLs, the TDX module may only initialize a part of the
+ * TDMR in each call.
+ */
+ do {
+ struct tdx_module_args args = {
+ .rcx = tdmr->base,
+ };
+ int ret;
+
+ ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
+ if (ret)
+ return ret;
+ /*
+ * RDX contains 'next-to-initialize' address if
+ * TDH.SYS.TDMR.INIT did not fully complete and
+ * should be retried.
+ */
+ next = args.rdx;
+ cond_resched();
+ /* Keep making SEAMCALLs until the TDMR is done */
+ } while (next < tdmr->base + tdmr->size);
+
+ return 0;
+}
+
+static int init_tdmrs(struct tdmr_info_list *tdmr_list)
+{
+ int i;
+
+ /*
+ * This operation is costly. It can be parallelized,
+ * but keep it simple for now.
+ */
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ int ret;
+
+ ret = init_tdmr(tdmr_entry(tdmr_list, i));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int init_tdx_module(void)
+{
+ struct tdx_sys_info sysinfo;
+ int ret;
+
+ ret = get_tdx_sys_info(&sysinfo);
+ if (ret)
+ return ret;
+
+ /* Check whether the kernel can support this module */
+ ret = check_features(&sysinfo);
+ if (ret)
+ return ret;
+
+ /*
+ * To keep things simple, assume that all TDX-protected memory
+ * will come from the page allocator. Make sure all pages in the
+ * page allocator are TDX-usable memory.
+ *
+ * Build the list of "TDX-usable" memory regions which cover all
+ * pages in the page allocator to guarantee that. Do it while
+ * holding mem_hotplug_lock read-lock as the memory hotplug code
+ * path reads the @tdx_memlist to reject any new memory.
+ */
+ get_online_mems();
+
+ ret = build_tdx_memlist(&tdx_memlist);
+ if (ret)
+ goto out_put_tdxmem;
+
+ /* Allocate enough space for constructing TDMRs */
+ ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr);
+ if (ret)
+ goto err_free_tdxmem;
+
+ /* Cover all TDX-usable memory regions in TDMRs */
+ ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr);
+ if (ret)
+ goto err_free_tdmrs;
+
+ /* Pass the TDMRs and the global KeyID to the TDX module */
+ ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
+ if (ret)
+ goto err_free_pamts;
+
+ /* Config the key of global KeyID on all packages */
+ ret = config_global_keyid();
+ if (ret)
+ goto err_reset_pamts;
+
+ /* Initialize TDMRs to complete the TDX module initialization */
+ ret = init_tdmrs(&tdx_tdmr_list);
+ if (ret)
+ goto err_reset_pamts;
+
+ pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
+
+out_put_tdxmem:
+ /*
+ * @tdx_memlist is written here and read at memory hotplug time.
+ * Lock out memory hotplug code while building it.
+ */
+ put_online_mems();
+ return ret;
+
+err_reset_pamts:
+ /*
+ * Part of PAMTs may already have been initialized by the
+ * TDX module. Flush cache before returning PAMTs back
+ * to the kernel.
+ */
+ wbinvd_on_all_cpus();
+ /*
+ * According to the TDX hardware spec, if the platform
+ * doesn't have the "partial write machine check"
+ * erratum, any kernel read/write will never cause #MC
+ * in kernel space, thus it's OK to not convert PAMTs
+ * back to normal. But do the conversion anyway here
+ * as suggested by the TDX spec.
+ */
+ tdmrs_reset_pamt_all(&tdx_tdmr_list);
+err_free_pamts:
+ tdmrs_free_pamt_all(&tdx_tdmr_list);
+err_free_tdmrs:
+ free_tdmr_list(&tdx_tdmr_list);
+err_free_tdxmem:
+ free_tdx_memlist(&tdx_memlist);
+ goto out_put_tdxmem;
+}
+
+static int __tdx_enable(void)
+{
+ int ret;
+
+ ret = init_tdx_module();
+ if (ret) {
+ pr_err("module initialization failed (%d)\n", ret);
+ tdx_module_status = TDX_MODULE_ERROR;
+ return ret;
+ }
+
+ pr_info("module initialized\n");
+ tdx_module_status = TDX_MODULE_INITIALIZED;
+
+ return 0;
+}
+
+/**
+ * tdx_enable - Enable TDX module to make it ready to run TDX guests
+ *
+ * This function assumes the caller has: 1) held read lock of CPU hotplug
+ * lock to prevent any new cpu from becoming online; 2) done both VMXON
+ * and tdx_cpu_enable() on all online cpus.
+ *
+ * This function requires there's at least one online cpu for each CPU
+ * package to succeed.
+ *
+ * This function can be called in parallel by multiple callers.
+ *
+ * Return 0 if TDX is enabled successfully, otherwise error.
+ */
+int tdx_enable(void)
+{
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+ return -ENODEV;
+
+ lockdep_assert_cpus_held();
+
+ mutex_lock(&tdx_module_lock);
+
+ switch (tdx_module_status) {
+ case TDX_MODULE_UNINITIALIZED:
+ ret = __tdx_enable();
+ break;
+ case TDX_MODULE_INITIALIZED:
+ /* Already initialized, great, tell the caller. */
+ ret = 0;
+ break;
+ default:
+ /* Failed to initialize in the previous attempts */
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tdx_module_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdx_enable);
+
+static bool is_pamt_page(unsigned long phys)
+{
+ struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
+ int i;
+
+ /* Ensure that all remote 'tdmr_list' writes are visible: */
+ smp_rmb();
+
+ /*
+ * The TDX module is no longer returning TDX_SYS_NOT_READY and
+ * is initialized. The 'tdmr_list' was initialized long ago
+ * and is now read-only.
+ */
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ unsigned long base, size;
+
+ tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
+
+ if (phys >= base && phys < (base + size))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Return whether the memory page at the given physical address is TDX
+ * private memory or not.
+ *
+ * This can be imprecise for two known reasons:
+ * 1. PAMTs are private memory and exist before the TDX module is
+ * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively
+ * short window that occurs once per boot.
+ * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
+ * page. However, the page can still cause #MC until it has been
+ * fully converted to shared using 64-byte writes like MOVDIR64B.
+ * Buggy hosts might still leave #MC-causing memory in place which
+ * this function can not detect.
+ */
+static bool paddr_is_tdx_private(unsigned long phys)
+{
+ struct tdx_module_args args = {
+ .rcx = phys & PAGE_MASK,
+ };
+ u64 sret;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+ return false;
+
+ /* Get page type from the TDX module */
+ sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+
+ /*
+ * The SEAMCALL will not return success unless there is a
+ * working, "ready" TDX module. Assume an absence of TDX
+ * private pages until SEAMCALL is working.
+ */
+ if (sret)
+ return false;
+
+ /*
+ * SEAMCALL was successful -- read page type (via RCX):
+ *
+ * - PT_NDA: Page is not used by the TDX module
+ * - PT_RSVD: Reserved for Non-TDX use
+ * - Others: Page is used by the TDX module
+ *
+ * Note PAMT pages are marked as PT_RSVD but they are also TDX
+ * private memory.
+ */
+ switch (args.rcx) {
+ case PT_NDA:
+ return false;
+ case PT_RSVD:
+ return is_pamt_page(phys);
+ default:
+ return true;
+ }
+}
+
+/*
+ * Some TDX-capable CPUs have an erratum. A write to TDX private
+ * memory poisons that memory, and a subsequent read of that memory
+ * triggers #MC.
+ *
+ * Help distinguish erratum-triggered #MCs from a normal hardware one.
+ * Just print additional message to show such #MC may be result of the
+ * erratum.
+ */
+const char *tdx_dump_mce_info(struct mce *m)
+{
+ if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
+ return NULL;
+
+ if (!paddr_is_tdx_private(m->addr))
+ return NULL;
+
+ return "TDX private memory error. Possible kernel bug.";
+}
+
+static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
+ u32 *nr_tdx_keyids)
+{
+ u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
+ int ret;
+
+ /*
+ * IA32_MKTME_KEYID_PARTIONING:
+ * Bit [31:0]: Number of MKTME KeyIDs.
+ * Bit [63:32]: Number of TDX private KeyIDs.
+ */
+ ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
+ &_nr_tdx_keyids);
+ if (ret || !_nr_tdx_keyids)
+ return -EINVAL;
+
+ /* TDX KeyIDs start after the last MKTME KeyID. */
+ _tdx_keyid_start = _nr_mktme_keyids + 1;
+
+ *tdx_keyid_start = _tdx_keyid_start;
+ *nr_tdx_keyids = _nr_tdx_keyids;
+
+ return 0;
+}
+
+static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct tdx_memblock *tmb;
+
+ /*
+ * This check assumes that the start_pfn<->end_pfn range does not
+ * cross multiple @tdx_memlist entries. A single memory online
+ * event across multiple memblocks (from which @tdx_memlist
+ * entries are derived at the time of module initialization) is
+ * not possible. This is because memory offline/online is done
+ * on granularity of 'struct memory_block', and the hotpluggable
+ * memory region (one memblock) must be multiple of memory_block.
+ */
+ list_for_each_entry(tmb, &tdx_memlist, list) {
+ if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
+ return true;
+ }
+ return false;
+}
+
+static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
+ void *v)
+{
+ struct memory_notify *mn = v;
+
+ if (action != MEM_GOING_ONLINE)
+ return NOTIFY_OK;
+
+ /*
+ * Empty list means TDX isn't enabled. Allow any memory
+ * to go online.
+ */
+ if (list_empty(&tdx_memlist))
+ return NOTIFY_OK;
+
+ /*
+ * The TDX memory configuration is static and can not be
+ * changed. Reject onlining any memory which is outside of
+ * the static configuration whether it supports TDX or not.
+ */
+ if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
+ return NOTIFY_OK;
+
+ return NOTIFY_BAD;
+}
+
+static struct notifier_block tdx_memory_nb = {
+ .notifier_call = tdx_memory_notifier,
+};
+
+static void __init check_tdx_erratum(void)
+{
+ /*
+ * These CPUs have an erratum. A partial write from non-TD
+ * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
+ * private memory poisons that memory, and a subsequent read of
+ * that memory triggers #MC.
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
+ setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
+ }
+}
+
+void __init tdx_init(void)
+{
+ u32 tdx_keyid_start, nr_tdx_keyids;
+ int err;
+
+ err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
+ if (err)
+ return;
+
+ pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
+ tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
+
+ /*
+ * The TDX module itself requires one 'global KeyID' to protect
+ * its metadata. If there's only one TDX KeyID, there won't be
+ * any left for TDX guests thus there's no point to enable TDX
+ * at all.
+ */
+ if (nr_tdx_keyids < 2) {
+ pr_err("initialization failed: too few private KeyIDs available.\n");
+ return;
+ }
+
+ /*
+ * At this point, hibernation_available() indicates whether or
+ * not hibernation support has been permanently disabled.
+ */
+ if (hibernation_available()) {
+ pr_err("initialization failed: Hibernation support is enabled\n");
+ return;
+ }
+
+ err = register_memory_notifier(&tdx_memory_nb);
+ if (err) {
+ pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
+ err);
+ return;
+ }
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
+ pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
+ acpi_suspend_lowlevel = NULL;
+#endif
+
+ /*
+ * Just use the first TDX KeyID as the 'global KeyID' and
+ * leave the rest for TDX guests.
+ */
+ tdx_global_keyid = tdx_keyid_start;
+ tdx_guest_keyid_start = tdx_keyid_start + 1;
+ tdx_nr_guest_keyids = nr_tdx_keyids - 1;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
+
+ check_tdx_erratum();
+}
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
new file mode 100644
index 000000000000..4e3d533cdd61
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_VIRT_TDX_H
+#define _X86_VIRT_TDX_H
+
+#include <linux/bits.h>
+#include "tdx_global_metadata.h"
+
+/*
+ * This file contains both macros and data structures defined by the TDX
+ * architecture and Linux defined software data structures and functions.
+ * The two should not be mixed together for better readability. The
+ * architectural definitions come first.
+ */
+
+/*
+ * TDX module SEAMCALL leaf functions
+ */
+#define TDH_PHYMEM_PAGE_RDMD 24
+#define TDH_SYS_KEY_CONFIG 31
+#define TDH_SYS_INIT 33
+#define TDH_SYS_RD 34
+#define TDH_SYS_LP_INIT 35
+#define TDH_SYS_TDMR_INIT 36
+#define TDH_SYS_CONFIG 45
+
+/* TDX page types */
+#define PT_NDA 0x0
+#define PT_RSVD 0x1
+
+struct tdmr_reserved_area {
+ u64 offset;
+ u64 size;
+} __packed;
+
+#define TDMR_INFO_ALIGNMENT 512
+#define TDMR_INFO_PA_ARRAY_ALIGNMENT 512
+
+struct tdmr_info {
+ u64 base;
+ u64 size;
+ u64 pamt_1g_base;
+ u64 pamt_1g_size;
+ u64 pamt_2m_base;
+ u64 pamt_2m_size;
+ u64 pamt_4k_base;
+ u64 pamt_4k_size;
+ /*
+ * The actual number of reserved areas depends on the value of
+ * field MD_FIELD_ID_MAX_RESERVED_PER_TDMR in the TDX module
+ * global metadata.
+ */
+ DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas);
+} __packed __aligned(TDMR_INFO_ALIGNMENT);
+
+/* Bit definitions of TDX_FEATURES0 metadata field */
+#define TDX_FEATURES0_NO_RBP_MOD BIT(18)
+
+/*
+ * Do not put any hardware-defined TDX structure representations below
+ * this comment!
+ */
+
+/* Kernel defined TDX module status during module initialization. */
+enum tdx_module_status_t {
+ TDX_MODULE_UNINITIALIZED,
+ TDX_MODULE_INITIALIZED,
+ TDX_MODULE_ERROR
+};
+
+struct tdx_memblock {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ int nid;
+};
+
+/* Warn if kernel has less than TDMR_NR_WARN TDMRs after allocation */
+#define TDMR_NR_WARN 4
+
+struct tdmr_info_list {
+ void *tdmrs; /* Flexible array to hold 'tdmr_info's */
+ int nr_consumed_tdmrs; /* How many 'tdmr_info's are in use */
+
+ /* Metadata for finding target 'tdmr_info' and freeing @tdmrs */
+ int tdmr_sz; /* Size of one 'tdmr_info' */
+ int max_tdmrs; /* How many 'tdmr_info's are allocated */
+};
+
+#endif
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
new file mode 100644
index 000000000000..8027a24d1c6e
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Automatically generated functions to read TDX global metadata.
+ *
+ * This file doesn't compile on its own as it lacks of inclusion
+ * of SEAMCALL wrapper primitive which reads global metadata.
+ * Include this file to other C file instead.
+ */
+
+static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features)
+{
+ int ret = 0;
+ u64 val;
+
+ if (!ret && !(ret = read_sys_metadata_field(0x0A00000300000008, &val)))
+ sysinfo_features->tdx_features0 = val;
+
+ return ret;
+}
+
+static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr)
+{
+ int ret = 0;
+ u64 val;
+
+ if (!ret && !(ret = read_sys_metadata_field(0x9100000100000008, &val)))
+ sysinfo_tdmr->max_tdmrs = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9100000100000009, &val)))
+ sysinfo_tdmr->max_reserved_per_tdmr = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9100000100000010, &val)))
+ sysinfo_tdmr->pamt_4k_entry_size = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9100000100000011, &val)))
+ sysinfo_tdmr->pamt_2m_entry_size = val;
+ if (!ret && !(ret = read_sys_metadata_field(0x9100000100000012, &val)))
+ sysinfo_tdmr->pamt_1g_entry_size = val;
+
+ return ret;
+}
+
+static int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
+{
+ int ret = 0;
+
+ ret = ret ?: get_tdx_sys_info_features(&sysinfo->features);
+ ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr);
+
+ return ret;
+}
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h
new file mode 100644
index 000000000000..6dd3c9695f59
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Automatically generated TDX global metadata structures. */
+#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H
+#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H
+
+#include <linux/types.h>
+
+struct tdx_sys_info_features {
+ u64 tdx_features0;
+};
+
+struct tdx_sys_info_tdmr {
+ u16 max_tdmrs;
+ u16 max_reserved_per_tdmr;
+ u16 pamt_4k_entry_size;
+ u16 pamt_2m_entry_size;
+ u16 pamt_1g_entry_size;
+};
+
+struct tdx_sys_info {
+ struct tdx_sys_info_features features;
+ struct tdx_sys_info_tdmr tdmr;
+};
+
+#endif