summaryrefslogtreecommitdiff
path: root/arch/x86/coco/tdx/tdx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/coco/tdx/tdx.c')
-rw-r--r--arch/x86/coco/tdx/tdx.c540
1 files changed, 456 insertions, 84 deletions
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 1d6b863c42b0..7b2833705d47 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -7,12 +7,17 @@
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
+#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
/* MMIO direction */
#define EPT_READ 0
@@ -28,17 +33,17 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
-#define ATTR_DEBUG BIT(0)
-#define ATTR_SEPT_VE_DISABLE BIT(28)
-
/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
#define TDCALL_INVALID_OPERAND 0xc0000100
+#define TDCALL_OPERAND_BUSY 0x80000200
#define TDREPORT_SUBTYPE_0 0
+static atomic_long_t nr_shared;
+
/* Called from __tdx_hypercall() for unrecoverable failure */
-noinstr void __tdx_hypercall_failed(void)
+noinstr void __noreturn __tdx_hypercall_failed(void)
{
instrumentation_begin();
panic("TDVMCALL failed. TDX module bug?");
@@ -48,7 +53,7 @@ noinstr void __tdx_hypercall_failed(void)
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
unsigned long p3, unsigned long p4)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = nr,
.r11 = p1,
.r12 = p2,
@@ -66,13 +71,38 @@ EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
* should only be used for calls that have no legitimate reason to fail
* or where the kernel can not survive the call failing.
*/
-static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
- struct tdx_module_output *out)
+static inline void tdcall(u64 fn, struct tdx_module_args *args)
{
- if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
+ if (__tdcall_ret(fn, args))
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
+/* Read TD-scoped metadata */
+static inline u64 tdg_vm_rd(u64 field, u64 *value)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ };
+ u64 ret;
+
+ ret = __tdcall_ret(TDG_VM_RD, &args);
+ *value = args.r8;
+
+ return ret;
+}
+
+/* Write TD-scoped metadata */
+static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ .r8 = value,
+ .r9 = mask,
+ };
+
+ return __tdcall(TDG_VM_WR, &args);
+}
+
/**
* tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
* subtype 0) using TDG.MR.REPORT TDCALL.
@@ -80,23 +110,29 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
* REPORTDATA to be included into TDREPORT.
* @tdreport: Address of the output buffer to store TDREPORT.
*
- * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
- * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.REPORT TDCALL.
+ *
* It is used in the TDX guest driver module to get the TDREPORT0.
*
- * Return 0 on success, -EINVAL for invalid operands, or -EIO on
- * other TDCALL failures.
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
*/
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(tdreport),
+ .rdx = virt_to_phys(reportdata),
+ .r8 = TDREPORT_SUBTYPE_0,
+ };
u64 ret;
- ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
- virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
- 0, NULL);
+ ret = __tdcall(TDG_MR_REPORT, &args);
if (ret) {
if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
- return -EINVAL;
+ return -ENXIO;
+ else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
return -EIO;
}
@@ -104,9 +140,66 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+/**
+ * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
+ * TDG.MR.RTMR.EXTEND TDCALL.
+ * @index: Index of RTMR register to be extended.
+ * @data: Address of the input buffer with RTMR register extend data.
+ *
+ * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
+ *
+ * It is used in the TDX guest driver module to allow user to extend the RTMR
+ * registers.
+ *
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
+ */
+int tdx_mcall_extend_rtmr(u8 index, u8 *data)
+{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(data),
+ .rdx = index,
+ };
+ u64 ret;
+
+ ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -ENXIO;
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);
+
+/**
+ * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
+ * hypercall.
+ * @buf: Address of the directly mapped shared kernel buffer which
+ * contains TDREPORT. The same buffer will be used by VMM to
+ * store the generated TD Quote output.
+ * @size: size of the tdquote buffer (4KB-aligned).
+ *
+ * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
+ * v1.0 specification for more information on GetQuote hypercall.
+ * It is used in the TDX guest driver module to get the TD Quote.
+ *
+ * Return 0 on success or error code on failure.
+ */
+u64 tdx_hcall_get_quote(u8 *buf, size_t size)
+{
+ /* Since buf is a shared memory, set the shared (decrypted) bits */
+ return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
+}
+EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
+
static void __noreturn tdx_panic(const char *msg)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = TDVMCALL_REPORT_FATAL_ERROR,
.r12 = 0, /* Error code: 0 is Panic */
@@ -115,11 +208,11 @@ static void __noreturn tdx_panic(const char *msg)
/* Define register order according to the GHCI */
struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
- char str[64];
+ char bytes[64] __nonstring;
} message;
/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
- strncpy(message.str, msg, 64);
+ strtomem_pad(message.bytes, msg, '\0');
args.r8 = message.r8;
args.r9 = message.r9;
@@ -139,9 +232,103 @@ static void __noreturn tdx_panic(const char *msg)
__tdx_hypercall(&args);
}
-static void tdx_parse_tdinfo(u64 *cc_mask)
+/*
+ * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
+ * that no #VE will be delivered for accesses to TD-private memory.
+ *
+ * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
+ * controls if the guest will receive such #VE with TD attribute
+ * TDX_ATTR_SEPT_VE_DISABLE.
+ *
+ * Newer TDX modules allow the guest to control if it wants to receive SEPT
+ * violation #VEs.
+ *
+ * Check if the feature is available and disable SEPT #VE if possible.
+ *
+ * If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
+ * attribute is no longer reliable. It reflects the initial state of the
+ * control for the TD, but it will not be updated if someone (e.g. bootloader)
+ * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
+ * determine if SEPT #VEs are enabled or disabled.
+ */
+static void disable_sept_ve(u64 td_attr)
+{
+ const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
+ bool debug = td_attr & TDX_ATTR_DEBUG;
+ u64 config, controls;
+
+ /* Is this TD allowed to disable SEPT #VE */
+ tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
+ if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
+ /* No SEPT #VE controls for the guest: check the attribute */
+ if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
+ return;
+
+ /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
+ if (debug)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ return;
+ }
+
+ /* Check if SEPT #VE has been disabled before us */
+ tdg_vm_rd(TDCS_TD_CTLS, &controls);
+ if (controls & TD_CTLS_PENDING_VE_DISABLE)
+ return;
+
+ /* Keep #VEs enabled for splats in debugging environments */
+ if (debug)
+ return;
+
+ /* Disable SEPT #VEs */
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
+ TD_CTLS_PENDING_VE_DISABLE);
+}
+
+/*
+ * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and
+ * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs.
+ * In practice, this means that the kernel can only boot with a plain topology.
+ * Any complications will cause problems.
+ *
+ * The ENUM_TOPOLOGY feature allows the VMM to provide topology information.
+ * Enabling the feature eliminates topology-related #VEs: the TDX module
+ * virtualizes accesses to the CPUID leafs and the MSR.
+ *
+ * Enable ENUM_TOPOLOGY if it is available.
+ */
+static void enable_cpu_topology_enumeration(void)
{
- struct tdx_module_output out;
+ u64 configured;
+
+ /* Has the VMM provided a valid topology configuration? */
+ tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured);
+ if (!configured) {
+ pr_err("VMM did not configure X2APIC_IDs properly\n");
+ return;
+ }
+
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
+}
+
+static void reduce_unnecessary_ve(void)
+{
+ u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);
+
+ if (err == TDX_SUCCESS)
+ return;
+
+ /*
+ * Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
+ * enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
+ */
+ enable_cpu_topology_enumeration();
+}
+
+static void tdx_setup(u64 *cc_mask)
+{
+ struct tdx_module_args args = {};
unsigned int gpa_width;
u64 td_attr;
@@ -152,7 +339,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
* [TDG.VP.INFO].
*/
- tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+ tdcall(TDG_VP_INFO, &args);
/*
* The highest bit of a guest physical address is the "sharing" bit.
@@ -161,24 +348,17 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* The GPA width that comes out of this call is critical. TDX guests
* can not meaningfully run without it.
*/
- gpa_width = out.rcx & GENMASK(5, 0);
+ gpa_width = args.rcx & GENMASK(5, 0);
*cc_mask = BIT_ULL(gpa_width - 1);
- /*
- * The kernel can not handle #VE's when accessing normal kernel
- * memory. Ensure that no #VE will be delivered for accesses to
- * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
- */
- td_attr = out.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
- const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
+ td_attr = args.rdx;
- /* Relax SEPT_VE_DISABLE check for debug TD. */
- if (td_attr & ATTR_DEBUG)
- pr_warn("%s\n", msg);
- else
- tdx_panic(msg);
- }
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+
+ disable_sept_ve(td_attr);
+
+ reduce_unnecessary_ve();
}
/*
@@ -228,7 +408,7 @@ static int ve_instr_len(struct ve_info *ve)
static u64 __cpuidle __halt(const bool irq_disabled)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_HLT),
.r12 = irq_disabled,
@@ -253,13 +433,21 @@ static int handle_halt(struct ve_info *ve)
{
const bool irq_disabled = irqs_disabled();
+ /*
+ * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a
+ * wake event may be consumed before requesting HLT emulation, leaving
+ * the vCPU blocking indefinitely.
+ */
+ if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
+ return -EIO;
+
if (__halt(irq_disabled))
return -EIO;
return ve_instr_len(ve);
}
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
{
const bool irq_disabled = false;
@@ -270,9 +458,19 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_halt();
+ /*
+ * "__cpuidle" section doesn't support instrumentation, so stick
+ * with raw_* variant that avoids tracing hooks.
+ */
+ raw_local_irq_enable();
+}
+
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_MSR_READ),
.r12 = regs->cx,
@@ -283,7 +481,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return -EIO;
regs->ax = lower_32_bits(args.r11);
@@ -293,7 +491,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
.r12 = regs->cx,
@@ -313,7 +511,7 @@ static int write_msr(struct pt_regs *regs, struct ve_info *ve)
static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_CPUID),
.r12 = regs->ax,
@@ -337,7 +535,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
* ABI can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
*/
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return -EIO;
/*
@@ -355,17 +553,17 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
.r12 = size,
.r13 = EPT_READ,
.r14 = addr,
- .r15 = *val,
};
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return false;
+
*val = args.r11;
return true;
}
@@ -405,6 +603,11 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EINVAL;
}
+ if (!fault_in_kernel_space(ve->gla)) {
+ WARN_ONCE(1, "Access to userspace address is not supported");
+ return -EINVAL;
+ }
+
/*
* Reject EPT violation #VEs that split pages.
*
@@ -483,7 +686,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
static bool handle_in(struct pt_regs *regs, int size, int port)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
.r12 = size,
@@ -498,7 +701,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
* in TDX Guest-Host-Communication Interface (GHCI) section titled
* "TDG.VP.VMCALL<Instruction.IO>".
*/
- success = !__tdx_hypercall_ret(&args);
+ success = !__tdx_hypercall(&args);
/* Update part of the register affected by the emulated instruction */
regs->ax &= ~mask;
@@ -577,7 +780,7 @@ __init bool tdx_early_handle_ve(struct pt_regs *regs)
void tdx_get_ve_info(struct ve_info *ve)
{
- struct tdx_module_output out;
+ struct tdx_module_args args = {};
/*
* Called during #VE handling to retrieve the #VE info from the
@@ -594,15 +797,15 @@ void tdx_get_ve_info(struct ve_info *ve)
* Note, the TDX module treats virtual NMIs as inhibited if the #VE
* valid flag is set. It means that NMI=>#VE will not result in a #DF.
*/
- tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
+ tdcall(TDG_VP_VEINFO_GET, &args);
/* Transfer the output parameters */
- ve->exit_reason = out.rcx;
- ve->exit_qual = out.rdx;
- ve->gla = out.r8;
- ve->gpa = out.r9;
- ve->instr_len = lower_32_bits(out.r10);
- ve->instr_info = upper_32_bits(out.r10);
+ ve->exit_reason = args.rcx;
+ ve->exit_qual = args.rdx;
+ ve->gla = args.r8;
+ ve->gpa = args.r9;
+ ve->instr_len = lower_32_bits(args.r10);
+ ve->instr_info = upper_32_bits(args.r10);
}
/*
@@ -703,14 +906,15 @@ static bool tdx_cache_flush_required(void)
}
/*
- * Inform the VMM of the guest's intent for this physical page: shared with
- * the VMM or private to the guest. The VMM is expected to change its mapping
- * of the page in response.
+ * Notify the VMM about page mapping conversion. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface (GHCI),
+ * section "TDG.VP.VMCALL<MapGPA>".
*/
-static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
{
- phys_addr_t start = __pa(vaddr);
- phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+ /* Retrying the hypercall a second time should succeed; use 3 just in case */
+ const int max_retries_per_page = 3;
+ int retry_count = 0;
if (!enc) {
/* Set the shared (decrypted) bits: */
@@ -718,12 +922,51 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
end |= cc_mkdec(0);
}
- /*
- * Notify the VMM about page mapping conversion. More info about ABI
- * can be found in TDX Guest-Host-Communication Interface (GHCI),
- * section "TDG.VP.VMCALL<MapGPA>"
- */
- if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
+ while (retry_count < max_retries_per_page) {
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_MAP_GPA,
+ .r12 = start,
+ .r13 = end - start };
+
+ u64 map_fail_paddr;
+ u64 ret = __tdx_hypercall(&args);
+
+ if (ret != TDVMCALL_STATUS_RETRY)
+ return !ret;
+ /*
+ * The guest must retry the operation for the pages in the
+ * region starting at the GPA specified in R11. R11 comes
+ * from the untrusted VMM. Sanity check it.
+ */
+ map_fail_paddr = args.r11;
+ if (map_fail_paddr < start || map_fail_paddr >= end)
+ return false;
+
+ /* "Consume" a retry without forward progress */
+ if (map_fail_paddr == start) {
+ retry_count++;
+ continue;
+ }
+
+ start = map_fail_paddr;
+ retry_count = 0;
+ }
+
+ return false;
+}
+
+/*
+ * Inform the VMM of the guest's intent for this physical page: shared with
+ * the VMM or private to the guest. The VMM is expected to change its mapping
+ * of the page in response.
+ */
+static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+{
+ phys_addr_t start = __pa(vaddr);
+ phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+
+ if (!tdx_map_gpa(start, end, enc))
return false;
/* shared->private conversion requires memory to be accepted before use */
@@ -733,28 +976,138 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
return true;
}
-static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
- bool enc)
+static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
+ bool enc)
{
/*
* Only handle shared->private conversion here.
* See the comment in tdx_early_init().
*/
- if (enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
+ if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ return 0;
}
-static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
+static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle private->shared conversion here.
* See the comment in tdx_early_init().
*/
- if (!enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
+ if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ if (enc)
+ atomic_long_sub(numpages, &nr_shared);
+ else
+ atomic_long_add(numpages, &nr_shared);
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void tdx_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel reaches here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+ * If race happened, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/* Walk direct mapping and convert all shared memory back to private */
+static void tdx_kexec_finish(void)
+{
+ unsigned long addr, end;
+ long found = 0, shared;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ lockdep_assert_irqs_disabled();
+
+ addr = PAGE_OFFSET;
+ end = PAGE_OFFSET + get_max_mapped();
+
+ while (addr < end) {
+ unsigned long size;
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ size = page_level_size(level);
+
+ if (pte && pte_decrypted(*pte)) {
+ int pages = size / PAGE_SIZE;
+
+ /*
+ * Touching memory with shared bit set triggers implicit
+ * conversion to shared.
+ *
+ * Make sure nobody touches the shared range from
+ * now on.
+ */
+ set_pte(pte, __pte(0));
+
+ /*
+ * Memory encryption state persists across kexec.
+ * If tdx_enc_status_changed() fails in the first
+ * kernel, it leaves memory in an unknown state.
+ *
+ * If that memory remains shared, accessing it in the
+ * *next* kernel through a private mapping will result
+ * in an unrecoverable guest shutdown.
+ *
+ * The kdump kernel boot is not impacted as it uses
+ * a pre-reserved memory range that is always private.
+ * However, gathering crash information could lead to
+ * a crash if it accesses unconverted memory through
+ * a private mapping which is possible when accessing
+ * that memory through /proc/vmcore, for example.
+ *
+ * In all cases, print error info in order to leave
+ * enough bread crumbs for debugging.
+ */
+ if (!tdx_enc_status_changed(addr, pages, true)) {
+ pr_err("Failed to unshare range %#lx-%#lx\n",
+ addr, addr + size);
+ }
+
+ found += pages;
+ }
+
+ addr += size;
+ }
+
+ __flush_tlb_all();
+
+ shared = atomic_long_read(&nr_shared);
+ if (shared != found) {
+ pr_err("shared page accounting is off\n");
+ pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+ }
+}
+
+static __init void tdx_announce(void)
+{
+ struct tdx_module_args args = {};
+ u64 controls;
+
+ pr_info("Guest detected\n");
+
+ tdcall(TDG_VP_INFO, &args);
+ tdx_dump_attributes(args.rdx);
+
+ tdg_vm_rd(TDCS_TD_CTLS, &controls);
+ tdx_dump_td_ctls(controls);
}
void __init tdx_early_init(void)
@@ -769,12 +1122,15 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+ /* TSC is the only reliable clock in TDX guest */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
cc_vendor = CC_VENDOR_INTEL;
- tdx_parse_tdinfo(&cc_mask);
- cc_set_mask(cc_mask);
- /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
- tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
+ /* Configure the TD */
+ tdx_setup(&cc_mask);
+
+ cc_set_mask(cc_mask);
/*
* All bits above GPA width are reserved and kernel treats shared bit
@@ -809,16 +1165,32 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+ x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+
+ /*
+ * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+ * will enable interrupts before HLT TDCALL invocation if executed
+ * in STI-shadow, possibly resulting in missed wakeup events.
+ *
+ * Modify all possible HLT execution paths to use TDX specific routines
+ * that directly execute TDCALL and toggle the interrupt state as
+ * needed after TDCALL completion. This also reduces HLT related #VEs
+ * in addition to having a reliable halt logic execution.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
* there.
*
* Intel-TDX has a secure RDMSR hypercall, but that needs to be
- * implemented seperately in the low level startup ASM code.
+ * implemented separately in the low level startup ASM code.
* Until that is in place, disable parallel bringup for TDX.
*/
x86_cpuinit.parallel_bringup = false;
- pr_info("Guest detected\n");
+ tdx_announce();
}