diff options
Diffstat (limited to 'arch/x86/virt/svm/sev.c')
-rw-r--r-- | arch/x86/virt/svm/sev.c | 1072 |
1 files changed, 1072 insertions, 0 deletions
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c new file mode 100644 index 000000000000..42e74a5a7d78 --- /dev/null +++ b/arch/x86/virt/svm/sev.c @@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD SVM-SEV Host Support. + * + * Copyright (C) 2023 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra <ashish.kalra@amd.com> + * + */ + +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/iommu.h> +#include <linux/amd-iommu.h> +#include <linux/nospec.h> + +#include <asm/sev.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> +#include <asm/iommu.h> + +/* + * The RMP entry information as returned by the RMPREAD instruction. + */ +struct rmpentry { + u64 gpa; + u8 assigned :1, + rsvd1 :7; + u8 pagesize :1, + hpage_region_status :1, + rsvd2 :6; + u8 immutable :1, + rsvd3 :7; + u8 rsvd4; + u32 asid; +} __packed; + +/* + * The raw RMP entry format is not architectural. The format is defined in PPR + * Family 19h Model 01h, Rev B1 processor. This format represents the actual + * entry in the RMP table memory. The bitfield definitions are used for machines + * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo" + * fields are only used for dumping the raw data. + */ +struct rmpentry_raw { + union { + struct { + u64 assigned : 1, + pagesize : 1, + immutable : 1, + rsvd1 : 9, + gpa : 39, + asid : 10, + vmsa : 1, + validated : 1, + rsvd2 : 1; + }; + u64 lo; + }; + u64 hi; +} __packed; + +/* + * The first 16KB from the RMP_BASE is used by the processor for the + * bookkeeping, the range needs to be added during the RMP entry lookup. + */ +#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 + +/* + * For a non-segmented RMP table, use the maximum physical addressing as the + * segment size in order to always arrive at index 0 in the table. + */ +#define RMPTABLE_NON_SEGMENTED_SHIFT 52 + +struct rmp_segment_desc { + struct rmpentry_raw *rmp_entry; + u64 max_index; + u64 size; +}; + +/* + * Segmented RMP Table support. + * - The segment size is used for two purposes: + * - Identify the amount of memory covered by an RMP segment + * - Quickly locate an RMP segment table entry for a physical address + * + * - The RMP segment table contains pointers to an RMP table that covers + * a specific portion of memory. There can be up to 512 8-byte entries, + * one pages worth. + */ +#define RST_ENTRY_MAPPED_SIZE(x) ((x) & GENMASK_ULL(19, 0)) +#define RST_ENTRY_SEGMENT_BASE(x) ((x) & GENMASK_ULL(51, 20)) + +#define RST_SIZE SZ_4K +static struct rmp_segment_desc **rmp_segment_table __ro_after_init; +static unsigned int rst_max_index __ro_after_init = 512; + +static unsigned int rmp_segment_shift; +static u64 rmp_segment_size; +static u64 rmp_segment_mask; + +#define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift) +#define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask))) + +static u64 rmp_cfg; + +/* Mask to apply to a PFN to get the first PFN of a 2MB page */ +#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) + +static u64 probed_rmp_base, probed_rmp_size; + +static LIST_HEAD(snp_leaked_pages_list); +static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); + +static unsigned long snp_nr_leaked_pages; + +#undef pr_fmt +#define pr_fmt(fmt) "SEV-SNP: " fmt + +static int __mfd_enable(unsigned int cpu) +{ + u64 val; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + rdmsrl(MSR_AMD64_SYSCFG, val); + + val |= MSR_AMD64_SYSCFG_MFDM; + + wrmsrl(MSR_AMD64_SYSCFG, val); + + return 0; +} + +static __init void mfd_enable(void *arg) +{ + __mfd_enable(smp_processor_id()); +} + +static int __snp_enable(unsigned int cpu) +{ + u64 val; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + rdmsrl(MSR_AMD64_SYSCFG, val); + + val |= MSR_AMD64_SYSCFG_SNP_EN; + val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; + + wrmsrl(MSR_AMD64_SYSCFG, val); + + return 0; +} + +static __init void snp_enable(void *arg) +{ + __snp_enable(smp_processor_id()); +} + +static void __init __snp_fixup_e820_tables(u64 pa) +{ + if (IS_ALIGNED(pa, PMD_SIZE)) + return; + + /* + * Handle cases where the RMP table placement by the BIOS is not + * 2M aligned and the kexec kernel could try to allocate + * from within that chunk which then causes a fatal RMP fault. + * + * The e820_table needs to be updated as it is converted to + * kernel memory resources and used by KEXEC_FILE_LOAD syscall + * to load kexec segments. + * + * The e820_table_firmware needs to be updated as it is exposed + * to sysfs and used by the KEXEC_LOAD syscall to load kexec + * segments. + * + * The e820_table_kexec needs to be updated as it passed to + * the kexec-ed kernel. + */ + pa = ALIGN_DOWN(pa, PMD_SIZE); + if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { + pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); + e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + if (!memblock_is_region_reserved(pa, PMD_SIZE)) + memblock_reserve(pa, PMD_SIZE); + } +} + +static void __init fixup_e820_tables_for_segmented_rmp(void) +{ + u64 pa, *rst, size, mapped_size; + unsigned int i; + + __snp_fixup_e820_tables(probed_rmp_base); + + pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + + __snp_fixup_e820_tables(pa + RST_SIZE); + + rst = early_memremap(pa, RST_SIZE); + if (!rst) + return; + + for (i = 0; i < rst_max_index; i++) { + pa = RST_ENTRY_SEGMENT_BASE(rst[i]); + mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); + if (!mapped_size) + continue; + + __snp_fixup_e820_tables(pa); + + /* + * Mapped size in GB. Mapped size is allowed to exceed + * the segment coverage size, but gets reduced to the + * segment coverage size. + */ + mapped_size <<= 30; + if (mapped_size > rmp_segment_size) + mapped_size = rmp_segment_size; + + /* Calculate the RMP segment size (16 bytes/page mapped) */ + size = PHYS_PFN(mapped_size) << 4; + + __snp_fixup_e820_tables(pa + size); + } + + early_memunmap(rst, RST_SIZE); +} + +static void __init fixup_e820_tables_for_contiguous_rmp(void) +{ + __snp_fixup_e820_tables(probed_rmp_base); + __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); +} + +void __init snp_fixup_e820_tables(void) +{ + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { + fixup_e820_tables_for_segmented_rmp(); + } else { + fixup_e820_tables_for_contiguous_rmp(); + } +} + +static bool __init clear_rmptable_bookkeeping(void) +{ + void *bk; + + bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB); + if (!bk) { + pr_err("Failed to map RMP bookkeeping area\n"); + return false; + } + + memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ); + + memunmap(bk); + + return true; +} + +static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa) +{ + u64 rst_index, rmp_segment_size_max; + struct rmp_segment_desc *desc; + void *rmp_segment; + + /* Calculate the maximum size an RMP can be (16 bytes/page mapped) */ + rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4; + + /* Validate the RMP segment size */ + if (segment_size > rmp_segment_size_max) { + pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n", + segment_size, rmp_segment_size_max); + return false; + } + + /* Validate the RMP segment table index */ + rst_index = RST_ENTRY_INDEX(pa); + if (rst_index >= rst_max_index) { + pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n", + pa, rmp_segment_size); + return false; + } + + if (rmp_segment_table[rst_index]) { + pr_err("RMP segment descriptor already exists at index %llu\n", rst_index); + return false; + } + + rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB); + if (!rmp_segment) { + pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n", + segment_pa, segment_size); + return false; + } + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + memunmap(rmp_segment); + return false; + } + + desc->rmp_entry = rmp_segment; + desc->max_index = segment_size / sizeof(*desc->rmp_entry); + desc->size = segment_size; + + rmp_segment_table[rst_index] = desc; + + return true; +} + +static void __init free_rmp_segment_table(void) +{ + unsigned int i; + + for (i = 0; i < rst_max_index; i++) { + struct rmp_segment_desc *desc; + + desc = rmp_segment_table[i]; + if (!desc) + continue; + + memunmap(desc->rmp_entry); + + kfree(desc); + } + + free_page((unsigned long)rmp_segment_table); + + rmp_segment_table = NULL; +} + +/* Allocate the table used to index into the RMP segments */ +static bool __init alloc_rmp_segment_table(void) +{ + struct page *page; + + page = alloc_page(__GFP_ZERO); + if (!page) + return false; + + rmp_segment_table = page_address(page); + + return true; +} + +static bool __init setup_contiguous_rmptable(void) +{ + u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end; + + if (!probed_rmp_size) + return false; + + rmp_end = probed_rmp_base + probed_rmp_size - 1; + + /* + * Calculate the amount of memory that must be reserved by the BIOS to + * address the whole RAM, including the bookkeeping area. The RMP itself + * must also be covered. + */ + max_rmp_pfn = max_pfn; + if (PFN_UP(rmp_end) > max_pfn) + max_rmp_pfn = PFN_UP(rmp_end); + + calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; + if (calc_rmp_sz > probed_rmp_size) { + pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + calc_rmp_sz, probed_rmp_size); + return false; + } + + if (!alloc_rmp_segment_table()) + return false; + + /* Map only the RMP entries */ + rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; + + if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) { + free_rmp_segment_table(); + return false; + } + + return true; +} + +static bool __init setup_segmented_rmptable(void) +{ + u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max; + unsigned int i, max_index; + + if (!probed_rmp_base) + return false; + + if (!alloc_rmp_segment_table()) + return false; + + rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB); + if (!rst) { + pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa); + goto e_free; + } + + pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30); + + ram_pa_max = max_pfn << PAGE_SHIFT; + + max_index = 0; + ram_pa_end = 0; + for (i = 0; i < rst_max_index; i++) { + u64 rmp_segment, rmp_size, mapped_size; + + mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); + if (!mapped_size) + continue; + + max_index = i; + + /* + * Mapped size in GB. Mapped size is allowed to exceed the + * segment coverage size, but gets reduced to the segment + * coverage size. + */ + mapped_size <<= 30; + if (mapped_size > rmp_segment_size) { + pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n", + i, mapped_size, rmp_segment_size); + mapped_size = rmp_segment_size; + } + + rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]); + + /* Calculate the RMP segment size (16 bytes/page mapped) */ + rmp_size = PHYS_PFN(mapped_size) << 4; + + pa = (u64)i << rmp_segment_shift; + + /* + * Some segments may be for MMIO mapped above system RAM. These + * segments are used for Trusted I/O. + */ + if (pa < ram_pa_max) + ram_pa_end = pa + mapped_size; + + if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa)) + goto e_unmap; + + pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n", + i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1); + } + + if (ram_pa_max > ram_pa_end) { + pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + ram_pa_max, ram_pa_end); + goto e_unmap; + } + + /* Adjust the maximum index based on the found segments */ + rst_max_index = max_index + 1; + + memunmap(rst); + + return true; + +e_unmap: + memunmap(rst); + +e_free: + free_rmp_segment_table(); + + return false; +} + +static bool __init setup_rmptable(void) +{ + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { + return setup_segmented_rmptable(); + } else { + return setup_contiguous_rmptable(); + } +} + +/* + * Do the necessary preparations which are verified by the firmware as + * described in the SNP_INIT_EX firmware command description in the SNP + * firmware ABI spec. + */ +int __init snp_rmptable_init(void) +{ + unsigned int i; + u64 val; + + if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP))) + return -ENOSYS; + + if (WARN_ON_ONCE(!amd_iommu_snp_en)) + return -ENOSYS; + + if (!setup_rmptable()) + return -ENOSYS; + + /* + * Check if SEV-SNP is already enabled, this can happen in case of + * kexec boot. + */ + rdmsrl(MSR_AMD64_SYSCFG, val); + if (val & MSR_AMD64_SYSCFG_SNP_EN) + goto skip_enable; + + /* Zero out the RMP bookkeeping area */ + if (!clear_rmptable_bookkeeping()) { + free_rmp_segment_table(); + return -ENOSYS; + } + + /* Zero out the RMP entries */ + for (i = 0; i < rst_max_index; i++) { + struct rmp_segment_desc *desc; + + desc = rmp_segment_table[i]; + if (!desc) + continue; + + memset(desc->rmp_entry, 0, desc->size); + } + + /* Flush the caches to ensure that data is written before SNP is enabled. */ + wbinvd_on_all_cpus(); + + /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */ + on_each_cpu(mfd_enable, NULL, 1); + + on_each_cpu(snp_enable, NULL, 1); + +skip_enable: + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); + + /* + * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic + * notifier is invoked to do SNP IOMMU shutdown before kdump. + */ + crash_kexec_post_notifiers = true; + + return 0; +} + +static void set_rmp_segment_info(unsigned int segment_shift) +{ + rmp_segment_shift = segment_shift; + rmp_segment_size = 1ULL << rmp_segment_shift; + rmp_segment_mask = rmp_segment_size - 1; +} + +#define RMP_ADDR_MASK GENMASK_ULL(51, 13) + +static bool probe_contiguous_rmptable_info(void) +{ + u64 rmp_sz, rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + + if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + if (rmp_base > rmp_end) { + pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); + return false; + } + + rmp_sz = rmp_end - rmp_base + 1; + + /* Treat the contiguous RMP table as a single segment */ + rst_max_index = 1; + + set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT); + + probed_rmp_base = rmp_base; + probed_rmp_size = rmp_sz; + + pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", + rmp_base, rmp_end); + + return true; +} + +static bool probe_segmented_rmptable_info(void) +{ + unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max; + u64 rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + if (!(rmp_base & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + WARN_ONCE(rmp_end & RMP_ADDR_MASK, + "Segmented RMP enabled but RMP_END MSR is non-zero\n"); + + /* Obtain the min and max supported RMP segment size */ + eax = cpuid_eax(0x80000025); + segment_shift_min = eax & GENMASK(5, 0); + segment_shift_max = (eax & GENMASK(11, 6)) >> 6; + + /* Verify the segment size is within the supported limits */ + segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg); + if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) { + pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n", + segment_shift, segment_shift_min, segment_shift_max); + return false; + } + + /* Override the max supported RST index if a hardware limit exists */ + ebx = cpuid_ebx(0x80000025); + if (ebx & BIT(10)) + rst_max_index = ebx & GENMASK(9, 0); + + set_rmp_segment_info(segment_shift); + + probed_rmp_base = rmp_base; + probed_rmp_size = 0; + + pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n", + rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE); + + return true; +} + +bool snp_probe_rmptable_info(void) +{ + if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP)) + rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg); + + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) + return probe_segmented_rmptable_info(); + else + return probe_contiguous_rmptable_info(); +} + +/* + * About the array_index_nospec() usage below: + * + * This function can get called by exported functions like + * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among + * others, and since the @pfn passed in cannot always be trusted, + * speculation should be stopped as a protective measure. + */ +static struct rmpentry_raw *get_raw_rmpentry(u64 pfn) +{ + u64 paddr, rst_index, segment_index; + struct rmp_segment_desc *desc; + + if (!rmp_segment_table) + return ERR_PTR(-ENODEV); + + paddr = pfn << PAGE_SHIFT; + + rst_index = RST_ENTRY_INDEX(paddr); + if (unlikely(rst_index >= rst_max_index)) + return ERR_PTR(-EFAULT); + + rst_index = array_index_nospec(rst_index, rst_max_index); + + desc = rmp_segment_table[rst_index]; + if (unlikely(!desc)) + return ERR_PTR(-EFAULT); + + segment_index = RMP_ENTRY_INDEX(paddr); + if (unlikely(segment_index >= desc->max_index)) + return ERR_PTR(-EFAULT); + + segment_index = array_index_nospec(segment_index, desc->max_index); + + return desc->rmp_entry + segment_index; +} + +static int get_rmpentry(u64 pfn, struct rmpentry *e) +{ + struct rmpentry_raw *e_raw; + + if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) { + int ret; + + /* Binutils version 2.44 supports the RMPREAD mnemonic. */ + asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd" + : "=a" (ret) + : "a" (pfn << PAGE_SHIFT), "c" (e) + : "memory", "cc"); + + return ret; + } + + e_raw = get_raw_rmpentry(pfn); + if (IS_ERR(e_raw)) + return PTR_ERR(e_raw); + + /* + * Map the raw RMP table entry onto the RMPREAD output format. + * The 2MB region status indicator (hpage_region_status field) is not + * calculated, since the overhead could be significant and the field + * is not used. + */ + memset(e, 0, sizeof(*e)); + e->gpa = e_raw->gpa << PAGE_SHIFT; + e->asid = e_raw->asid; + e->assigned = e_raw->assigned; + e->pagesize = e_raw->pagesize; + e->immutable = e_raw->immutable; + + return 0; +} + +static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level) +{ + struct rmpentry e_large; + int ret; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + ret = get_rmpentry(pfn, e); + if (ret) + return ret; + + /* + * Find the authoritative RMP entry for a PFN. This can be either a 4K + * RMP entry or a special large RMP entry that is authoritative for a + * whole 2M area. + */ + ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large); + if (ret) + return ret; + + *level = RMP_TO_PG_LEVEL(e_large.pagesize); + + return 0; +} + +int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) +{ + struct rmpentry e; + int ret; + + ret = __snp_lookup_rmpentry(pfn, &e, level); + if (ret) + return ret; + + *assigned = !!e.assigned; + return 0; +} +EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); + +/* + * Dump the raw RMP entry for a particular PFN. These bits are documented in the + * PPR for a particular CPU model and provide useful information about how a + * particular PFN is being utilized by the kernel/firmware at the time certain + * unexpected events occur, such as RMP faults. + */ +static void dump_rmpentry(u64 pfn) +{ + struct rmpentry_raw *e_raw; + u64 pfn_i, pfn_end; + struct rmpentry e; + int level, ret; + + ret = __snp_lookup_rmpentry(pfn, &e, &level); + if (ret) { + pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n", + pfn, ret); + return; + } + + if (e.assigned) { + e_raw = get_raw_rmpentry(pfn); + if (IS_ERR(e_raw)) { + pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n", + pfn, PTR_ERR(e_raw)); + return; + } + + pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n", + pfn, e_raw->lo, e_raw->hi); + return; + } + + /* + * If the RMP entry for a particular PFN is not in an assigned state, + * then it is sometimes useful to get an idea of whether or not any RMP + * entries for other PFNs within the same 2MB region are assigned, since + * those too can affect the ability to access a particular PFN in + * certain situations, such as when the PFN is being accessed via a 2MB + * mapping in the host page table. + */ + pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD); + pfn_end = pfn_i + PTRS_PER_PMD; + + pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n", + pfn, pfn_i, pfn_end); + + while (pfn_i < pfn_end) { + e_raw = get_raw_rmpentry(pfn_i); + if (IS_ERR(e_raw)) { + pr_err("Error %ld reading RMP contents for PFN 0x%llx\n", + PTR_ERR(e_raw), pfn_i); + pfn_i++; + continue; + } + + if (e_raw->lo || e_raw->hi) + pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi); + pfn_i++; + } +} + +void snp_dump_hva_rmpentry(unsigned long hva) +{ + unsigned long paddr; + unsigned int level; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd += pgd_index(hva); + pte = lookup_address_in_pgd(pgd, hva, &level); + + if (!pte) { + pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva); + return; + } + + paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level)); + dump_rmpentry(PHYS_PFN(paddr)); +} + +/* + * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the + * Validated bit. + */ +int psmash(u64 pfn) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + if (!pfn_valid(pfn)) + return -EINVAL; + + /* Binutils version 2.36 supports the PSMASH mnemonic. */ + asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" + : "=a" (ret) + : "a" (paddr) + : "memory", "cc"); + + return ret; +} +EXPORT_SYMBOL_GPL(psmash); + +/* + * If the kernel uses a 2MB or larger directmap mapping to write to an address, + * and that mapping contains any 4KB pages that are set to private in the RMP + * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that + * owns the PFNs being transitioned will never attempt such a write, but other + * kernel tasks writing to other PFNs in the range may trigger these checks + * inadvertently due a large directmap mapping that happens to overlap such a + * PFN. + * + * Prevent this by splitting any 2MB+ mappings that might end up containing a + * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the + * PFN/rmp_level passed in. + * + * Note that there is no attempt here to scan all the RMP entries for the 2MB + * physical range, since it would only be worthwhile in determining if a + * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of + * the same shared/private state, thus avoiding the need to split the mapping. + * But that would mean the entries are currently in a mixed state, and so the + * mapping would have already been split as a result of prior transitions. + * And since the 4K split is only done if the mapping is 2MB+, and there isn't + * currently a mechanism in place to restore 2MB+ mappings, such a check would + * not provide any usable benefit. + * + * More specifics on how these checks are carried out can be found in APM + * Volume 2, "RMP and VMPL Access Checks". + */ +static int adjust_direct_map(u64 pfn, int rmp_level) +{ + unsigned long vaddr; + unsigned int level; + int npages, ret; + pte_t *pte; + + /* + * pfn_to_kaddr() will return a vaddr only within the direct + * map range. + */ + vaddr = (unsigned long)pfn_to_kaddr(pfn); + + /* Only 4KB/2MB RMP entries are supported by current hardware. */ + if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M)) + return -EINVAL; + + if (!pfn_valid(pfn)) + return -EINVAL; + + if (rmp_level == PG_LEVEL_2M && + (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1))) + return -EINVAL; + + /* + * If an entire 2MB physical range is being transitioned, then there is + * no risk of RMP #PFs due to write accesses from overlapping mappings, + * since even accesses from 1GB mappings will be treated as 2MB accesses + * as far as RMP table checks are concerned. + */ + if (rmp_level == PG_LEVEL_2M) + return 0; + + pte = lookup_address(vaddr, &level); + if (!pte || pte_none(*pte)) + return 0; + + if (level == PG_LEVEL_4K) + return 0; + + npages = page_level_size(rmp_level) / PAGE_SIZE; + ret = set_memory_4k(vaddr, npages); + if (ret) + pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n", + pfn, ret); + + return ret; +} + +/* + * It is expected that those operations are seldom enough so that no mutual + * exclusion of updaters is needed and thus the overlap error condition below + * should happen very rarely and would get resolved relatively quickly by + * the firmware. + * + * If not, one could consider introducing a mutex or so here to sync concurrent + * RMP updates and thus diminish the amount of cases where firmware needs to + * lock 2M ranges to protect against concurrent updates. + * + * The optimal solution would be range locking to avoid locking disjoint + * regions unnecessarily but there's no support for that yet. + */ +static int rmpupdate(u64 pfn, struct rmp_state *state) +{ + unsigned long paddr = pfn << PAGE_SHIFT; + int ret, level; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + level = RMP_TO_PG_LEVEL(state->pagesize); + + if (adjust_direct_map(pfn, level)) + return -EFAULT; + + do { + /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" + : "=a" (ret) + : "a" (paddr), "c" ((unsigned long)state) + : "memory", "cc"); + } while (ret == RMPUPDATE_FAIL_OVERLAP); + + if (ret) { + pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n", + pfn, level, ret); + dump_rmpentry(pfn); + dump_stack(); + return -EFAULT; + } + + return 0; +} + +/* Transition a page to guest-owned/private state in the RMP table. */ +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable) +{ + struct rmp_state state; + + memset(&state, 0, sizeof(state)); + state.assigned = 1; + state.asid = asid; + state.immutable = immutable; + state.gpa = gpa; + state.pagesize = PG_LEVEL_TO_RMP(level); + + return rmpupdate(pfn, &state); +} +EXPORT_SYMBOL_GPL(rmp_make_private); + +/* Transition a page to hypervisor-owned/shared state in the RMP table. */ +int rmp_make_shared(u64 pfn, enum pg_level level) +{ + struct rmp_state state; + + memset(&state, 0, sizeof(state)); + state.pagesize = PG_LEVEL_TO_RMP(level); + + return rmpupdate(pfn, &state); +} +EXPORT_SYMBOL_GPL(rmp_make_shared); + +void snp_leak_pages(u64 pfn, unsigned int npages) +{ + struct page *page = pfn_to_page(pfn); + + pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages); + + spin_lock(&snp_leaked_pages_list_lock); + while (npages--) { + + /* + * Reuse the page's buddy list for chaining into the leaked + * pages list. This page should not be on a free list currently + * and is also unsafe to be added to a free list. + */ + if (likely(!PageCompound(page)) || + + /* + * Skip inserting tail pages of compound page as + * page->buddy_list of tail pages is not usable. + */ + (PageHead(page) && compound_nr(page) <= npages)) + list_add_tail(&page->buddy_list, &snp_leaked_pages_list); + + dump_rmpentry(pfn); + snp_nr_leaked_pages++; + pfn++; + page++; + } + spin_unlock(&snp_leaked_pages_list_lock); +} +EXPORT_SYMBOL_GPL(snp_leak_pages); + +void kdump_sev_callback(void) +{ + /* + * Do wbinvd() on remote CPUs when SNP is enabled in order to + * safely do SNP_SHUTDOWN on the local CPU. + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + wbinvd(); +} |