summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2021-10-16 15:17:46 +0200
committerIngo Molnar <mingo@kernel.org>2021-10-16 15:17:46 +0200
commit082f20b21de20285da2cbfc1be29656f0714c1b8 (patch)
tree6170af99f1491fe256351b9cf92725dbda79c153 /arch/x86/kernel
parent724fc0248d450224b19ef5b5ee41e392348f6704 (diff)
parentb2381acd3fd9bacd2c63f53b2c610c89959b31cc (diff)
Merge branch 'x86/urgent' into x86/fpu, to resolve a conflict
Resolve the conflict between these commits: x86/fpu: 1193f408cd51 ("x86/fpu/signal: Change return type of __fpu_restore_sig() to boolean") x86/urgent: d298b03506d3 ("x86/fpu: Restore the masking out of reserved MXCSR bits") b2381acd3fd9 ("x86/fpu: Mask out the invalid MXCSR bits properly") Conflicts: arch/x86/kernel/fpu/signal.c Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/cpu/common.c1
-rw-r--r--arch/x86/kernel/cpu/mce/core.c43
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c6
-rw-r--r--arch/x86/kernel/early-quirks.c6
-rw-r--r--arch/x86/kernel/fpu/signal.c11
-rw-r--r--arch/x86/kernel/hpet.c81
-rw-r--r--arch/x86/kernel/kvmclock.c13
-rw-r--r--arch/x86/kernel/setup.c26
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/sev-shared.c2
10 files changed, 145 insertions, 46 deletions
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0f8885949e8c..b3410f1ac217 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -326,6 +326,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_SMAP
cr4_set_bits(X86_CR4_SMAP);
#else
+ clear_cpu_cap(c, X86_FEATURE_SMAP);
cr4_clear_bits(X86_CR4_SMAP);
#endif
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index cd919fce3ca3..2af4b420172e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1235,6 +1235,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
static void kill_me_now(struct callback_head *ch)
{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
force_sig(SIGBUS);
}
@@ -1244,6 +1247,7 @@ static void kill_me_maybe(struct callback_head *cb)
int flags = MF_ACTION_REQUIRED;
int ret;
+ p->mce_count = 0;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
@@ -1272,17 +1276,34 @@ static void kill_me_maybe(struct callback_head *cb)
}
}
-static void queue_task_work(struct mce *m, int kill_current_task)
+static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
{
- current->mce_addr = m->addr;
- current->mce_kflags = m->kflags;
- current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
- current->mce_whole_page = whole_page(m);
+ int count = ++current->mce_count;
- if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
- else
- current->mce_kill_me.func = kill_me_maybe;
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+
+ if (kill_current_task)
+ current->mce_kill_me.func = kill_me_now;
+ else
+ current->mce_kill_me.func = kill_me_maybe;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", m, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
task_work_add(current, &current->mce_kill_me, TWA_RESUME);
}
@@ -1420,7 +1441,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, kill_current_task);
+ queue_task_work(&m, msg, kill_current_task);
} else {
/*
@@ -1438,7 +1459,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, kill_current_task);
+ queue_task_work(&m, msg, kill_current_task);
}
out:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 4b8813bafffd..bb1c3f5f60c8 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
rdt_domain_reconfigure_cdp(r);
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- kfree(d);
+ kfree(hw_dom);
return;
}
if (r->mon_capable && domain_setup_mon_state(r, d)) {
- kfree(d);
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom->mbps_val);
+ kfree(hw_dom);
return;
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 38837dad46e6..391a4e2b8604 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -714,12 +714,6 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3e20,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x3ec4,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
- { PCI_VENDOR_ID_INTEL, 0x8a12,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{ PCI_VENDOR_ID_BROADCOM, 0x4331,
PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 39c7bae97daf..51c4915a35f0 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -384,9 +384,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
sizeof(fpu->state.fxsave)))
return false;
- /* Reject invalid MXCSR values. */
- if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask)
- return false;
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Reject invalid MXCSR values. */
+ if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask)
+ return false;
+ } else {
+ /* Mask invalid bits out for historical reasons (broken hardware). */
+ fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+ }
/* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
if (use_xsave())
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 42fc41dd0e1f..882213df3713 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -10,6 +10,7 @@
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
+#include <asm/mwait.h>
#undef pr_fmt
#define pr_fmt(fmt) "hpet: " fmt
@@ -916,6 +917,83 @@ static bool __init hpet_counting(void)
return false;
}
+static bool __init mwait_pc10_supported(void)
+{
+ unsigned int eax, ebx, ecx, mwait_substates;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
+ return false;
+
+ if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return false;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+
+ return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
+ (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
+ (mwait_substates & (0xF << 28));
+}
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ * - ACPI tables
+ * - Enablement of intel_idle
+ * - Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
+ * and the only reasons why it is still in use on modern systems is the
+ * fact that it is impossible to reliably query TSC and CPU frequency via
+ * CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
+ * done via PMTIMER as well which seems to be the last remaining timer on
+ * X86/INTEL platforms that has not been completely wreckaged by feature
+ * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
+ * It's only 20 years now that hardware people have been asked to provide
+ * reliable and discoverable facilities which can be used for timekeeping
+ * and per CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
+ * forseeable future is close to zero, so the kernel has to be cluttered
+ * with heuristics to keep up with the ever growing amount of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+ unsigned long long pcfg;
+
+ /* Check whether PC10 substates are supported */
+ if (!mwait_pc10_supported())
+ return false;
+
+ /* Check whether PC10 is enabled in PKG C-state limit */
+ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ if ((pcfg & 0xF) < 8)
+ return false;
+
+ if (hpet_force_user) {
+ pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+ return false;
+ }
+
+ pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+ boot_hpet_disable = true;
+ return true;
+}
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
@@ -929,6 +1007,9 @@ int __init hpet_enable(void)
if (!is_hpet_capable())
return 0;
+ if (hpet_is_pc10_damaged())
+ return 0;
+
hpet_set_mapping();
if (!hpet_virt_address)
return 0;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ad273e5861c1..73c74b961d0f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -49,18 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
static struct pvclock_vsyscall_time_info
hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
static struct pvclock_wall_clock wall_clock __bss_decrypted;
-static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
static struct pvclock_vsyscall_time_info *hvclock_mem;
-
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
-{
- return &this_cpu_read(hv_clock_per_cpu)->pvti;
-}
-
-static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
-{
- return this_cpu_read(hv_clock_per_cpu);
-}
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
/*
* The wallclock is the time of day when we booted. Since then, some time may
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 79f164141116..40ed44ead063 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
+ /*
+ * Do some memory reservations *before* memory is added to memblock, so
+ * memblock allocations won't overwrite it.
+ *
+ * After this point, everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not RAM in
+ * e820. All other memory is free game.
+ *
+ * This call needs to happen before e820__memory_setup() which calls the
+ * xen_memory_setup() on Xen dom0 which relies on the fact that those
+ * early reservations have happened already.
+ */
+ early_reserve_memory();
+
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
e820__memory_setup();
parse_setup_data();
@@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
- /*
- * Do some memory reservations *before* memory is added to
- * memblock, so memblock allocations won't overwrite it.
- * Do it after early param, so we could get (unlikely) panic from
- * serial.
- *
- * After this point everything still needed from the boot loader or
- * firmware or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
- early_reserve_memory();
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 78a32b956e81..5afd98559193 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
- memblock_free(__pa(ptr), size);
+ memblock_free_ptr(ptr, size);
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 9f90f460a28c..bf1033a62e48 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -130,6 +130,8 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
} else {
ret = ES_VMM_ERROR;
}
+ } else if (ghcb->save.sw_exit_info_1 & 0xffffffff) {
+ ret = ES_VMM_ERROR;
} else {
ret = ES_OK;
}