diff options
Diffstat (limited to 'arch/x86')
107 files changed, 1873 insertions, 1412 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67745ceab0db..276b3baf1c50 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,6 +69,7 @@ config X86 select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE @@ -81,6 +82,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 @@ -462,8 +464,8 @@ config X86_X2APIC Some Intel systems circa 2022 and later are locked into x2APIC mode and can not fall back to the legacy APIC modes if SGX or TDX are - enabled in the BIOS. They will be unable to boot without enabling - this option. + enabled in the BIOS. They will boot with very reduced functionality + without enabling this option. If you don't know what to do here, say N. @@ -1109,7 +1111,6 @@ config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI select IRQ_DOMAIN_HIERARCHY - select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9860ca5979f8..9e38ffaadb5d 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -83,7 +83,7 @@ cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) - @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')' + @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')' OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index e476bcbd9b42..454757fbdfe5 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -668,7 +668,7 @@ static bool process_mem_region(struct mem_vector *region, } } #endif - return 0; + return false; } #ifdef CONFIG_EFI diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 8a3fff9128bb..1c8541ae3b3a 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -350,7 +350,7 @@ static int _kstrtoul(const char *s, unsigned int base, unsigned long *res) } /** - * kstrtoul - convert a string to an unsigned long + * boot_kstrtoul - convert a string to an unsigned long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index b8998cf0508a..cfd4c95b9f04 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -5,6 +5,8 @@ #define pr_fmt(fmt) "tdx: " fmt #include <linux/cpufeature.h> +#include <linux/export.h> +#include <linux/io.h> #include <asm/coco.h> #include <asm/tdx.h> #include <asm/vmx.h> @@ -15,6 +17,7 @@ /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 #define TDX_GET_VEINFO 3 +#define TDX_GET_REPORT 4 #define TDX_ACCEPT_PAGE 6 /* TDX hypercall Leaf IDs */ @@ -36,6 +39,12 @@ #define ATTR_SEPT_VE_DISABLE BIT(28) +/* TDX Module call error codes */ +#define TDCALL_RETURN_CODE(a) ((a) >> 32) +#define TDCALL_INVALID_OPERAND 0xc0000100 + +#define TDREPORT_SUBTYPE_0 0 + /* * Wrapper for standard use of __tdx_hypercall with no output aside from * return code. @@ -100,6 +109,37 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); } +/** + * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT + * subtype 0) using TDG.MR.REPORT TDCALL. + * @reportdata: Address of the input buffer which contains user-defined + * REPORTDATA to be included into TDREPORT. + * @tdreport: Address of the output buffer to store TDREPORT. + * + * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module + * v1.0 specification for more information on TDG.MR.REPORT TDCALL. + * It is used in the TDX guest driver module to get the TDREPORT0. + * + * Return 0 on success, -EINVAL for invalid operands, or -EIO on + * other TDCALL failures. + */ +int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) +{ + u64 ret; + + ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport), + virt_to_phys(reportdata), TDREPORT_SUBTYPE_0, + 0, NULL); + if (ret) { + if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) + return -EINVAL; + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); + static void tdx_parse_tdinfo(u64 *cc_mask) { struct tdx_module_output out; diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4dd19819053a..59b93901660d 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -61,7 +61,7 @@ SYM_CODE_START(entry_SYSENTER_compat) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ + pushq $__USER_DS /* pt_regs->ss */ pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* @@ -197,7 +197,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ + pushq $__USER_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 4bf48462fca7..e8c60ae7a7c8 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -27,7 +27,9 @@ VERSION { __vdso_time; clock_getres; __vdso_clock_getres; +#ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; +#endif local: *; }; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index d45c5fcfeac2..b8f3f9b9e53c 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -98,24 +98,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, } #ifdef CONFIG_TIME_NS -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - /* * The vvar page layout depends on whether a task belongs to the root or * non-root time namespace. Whenever a task changes its namespace, the VVAR @@ -140,11 +122,6 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) return 0; } -#else -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} #endif static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, @@ -210,11 +187,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { - struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); + pfn = hv_get_tsc_pfn(); - if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) - return vmf_insert_pfn(vma, vmf->address, - virt_to_phys(tsc_pg) >> PAGE_SHIFT); + if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) + return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_timens_page) { struct page *timens_page = find_timens_vvar_page(vma); diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index f1bff153d945..58461fa18b6f 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void) * On ctxswin, sched_in = true, called after the PMU has started * On ctxswout, sched_in = false, called before the PMU is stopped */ -void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 8b70237c33f7..d6f3703e4119 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -861,8 +861,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) pmu_enabled = cpuc->enabled; cpuc->enabled = 0; - /* stop everything (includes BRS) */ - amd_pmu_disable_all(); + amd_brs_disable_all(); /* Drain BRS is in use (could be inactive) */ if (cpuc->lbr_users) @@ -873,7 +872,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) cpuc->enabled = pmu_enabled; if (pmu_enabled) - amd_pmu_enable_all(0); + amd_brs_enable_all(); return amd_pmu_adjust_nmi_window(handled); } diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4cb710efbdd9..da3f5ebac4e1 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -631,7 +631,7 @@ static const struct attribute_group *op_attr_update[] = { static struct perf_ibs perf_ibs_fetch = { .pmu = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = perf_ibs_init, .add = perf_ibs_add, @@ -655,7 +655,7 @@ static struct perf_ibs perf_ibs_fetch = { static struct perf_ibs perf_ibs_op = { .pmu = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = perf_ibs_init, .add = perf_ibs_add, diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 38a75216c12c..eb31f850841a 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = reg->reg; } - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) amd_pmu_lbr_reset(); @@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } -void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index d568afc705d2..83f15fe411b3 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -553,6 +553,7 @@ static void uncore_clean_online(void) hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { hlist_del(&uncore->node); + kfree(uncore->events); kfree(uncore); } } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b30b8bbcd1e2..85a63a41c471 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); +DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); + static_call_update(x86_pmu_filter, x86_pmu.filter); } static void _x86_pmu_read(struct perf_event *event) @@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, pr_info("... event mask: %016Lx\n", intel_ctrl); } -/* - * The generic code is not hybrid friendly. The hybrid_pmu->pmu - * of the first registered PMU is unconditionally assigned to - * each possible cpuctx->ctx.pmu. - * Update the correct hybrid PMU to the cpuctx->ctx.pmu. - */ -void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu) -{ - struct perf_cpu_context *cpuctx; - - if (!pmu->pmu_cpu_context) - return; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->ctx.pmu = pmu; -} - static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2175,13 +2161,9 @@ static int __init init_hw_perf_events(void) if (err) goto out2; } else { - u8 cpu_type = get_this_hybrid_cpu_type(); struct x86_hybrid_pmu *hybrid_pmu; int i, j; - if (!cpu_type && x86_pmu.get_hybrid_cpu_type) - cpu_type = x86_pmu.get_hybrid_cpu_type(); - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; @@ -2195,9 +2177,6 @@ static int __init init_hw_perf_events(void) (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; - - if (cpu_type == hybrid_pmu->cpu_type) - x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id()); } if (i < x86_pmu.num_hybrid_pmus) { @@ -2646,15 +2625,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { - static_call_cond(x86_pmu_sched_task)(ctx, sched_in); + static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); } -static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { - static_call_cond(x86_pmu_swap_task_ctx)(prev, next); + static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); } void perf_check_microcode(void) @@ -2689,12 +2668,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event) return 0; } -static int x86_pmu_filter_match(struct perf_event *event) +static bool x86_pmu_filter(struct pmu *pmu, int cpu) { - if (x86_pmu.filter_match) - return x86_pmu.filter_match(event); + bool ret = false; - return 1; + static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); + + return ret; } static struct pmu pmu = { @@ -2725,7 +2705,7 @@ static struct pmu pmu = { .aux_output_match = x86_pmu_aux_output_match, - .filter_match = x86_pmu_filter_match, + .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1b92bf05fd65..dfd2c124cdf8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4536,8 +4536,6 @@ end: cpumask_set_cpu(cpu, &pmu->supported_cpus); cpuc->pmu = &pmu->pmu; - x86_pmu_update_cpu_context(&pmu->pmu, cpu); - return true; } @@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu) cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus); } -static void intel_pmu_sched_task(struct perf_event_context *ctx, +static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { - intel_pmu_pebs_sched_task(ctx, sched_in); - intel_pmu_lbr_sched_task(ctx, sched_in); + intel_pmu_pebs_sched_task(pmu_ctx, sched_in); + intel_pmu_lbr_sched_task(pmu_ctx, sched_in); } -static void intel_pmu_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { - intel_pmu_lbr_swap_task_ctx(prev, next); + intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc); } static int intel_pmu_check_period(struct perf_event *event, u64 value) @@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event) return is_intel_pt_event(event); } -static int intel_pmu_filter_match(struct perf_event *event) +static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret) { - struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); - unsigned int cpu = smp_processor_id(); + struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu); - return cpumask_test_cpu(cpu, &pmu->supported_cpus); + *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus); } PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); @@ -6413,7 +6410,7 @@ __init int intel_pmu_init(void) static_call_update(intel_pmu_set_topdown_event_period, &adl_set_topdown_event_period); - x86_pmu.filter_match = intel_pmu_filter_match; + x86_pmu.filter = intel_pmu_filter; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; x86_pmu.limit_period = spr_limit_period; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 446d2833efa7..88e58b6ee73c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1069,7 +1069,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1177,7 +1177,7 @@ static void pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct perf_event *event, bool add) { - struct pmu *pmu = event->ctx->pmu; + struct pmu *pmu = event->pmu; /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8259d725054d..017baba56b01 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx) cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next) +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc) { void *prev_ctx_data, *next_ctx_data; - swap(prev->task_ctx_data, next->task_ctx_data); + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); /* - * Architecture specific synchronization makes sense in - * case both prev->task_ctx_data and next->task_ctx_data + * Architecture specific synchronization makes sense in case + * both prev_epc->task_ctx_data and next_epc->task_ctx_data * pointers are allocated. */ - prev_ctx_data = next->task_ctx_data; - next_ctx_data = prev->task_ctx_data; + prev_ctx_data = next_epc->task_ctx_data; + next_ctx_data = prev_epc->task_ctx_data; if (!prev_ctx_data || !next_ctx_data) return; @@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, task_context_opt(next_ctx_data)->lbr_callstack_users); } -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); void *task_ctx; @@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) * the task was scheduled out, restore the stack. Otherwise flush * the LBR stack. */ - task_ctx = ctx ? ctx->task_ctx_data : NULL; + task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL; if (task_ctx) { if (sched_in) __intel_pmu_lbr_restore(task_ctx); @@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) - task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; + if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data) + task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event) */ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) cpuc->lbr_pebs_users++; - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); } @@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) - task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; + event->pmu_ctx->task_ctx_data) + task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; @@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } static inline bool vlbr_exclude_host(void) diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 03bbcc2fa2ff..35936188db01 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -24,7 +24,7 @@ struct p4_event_bind { unsigned int escr_msr[2]; /* ESCR MSR for this event */ unsigned int escr_emask; /* valid ESCR EventMask bits */ unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ + signed char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ }; struct p4_pebs_bind { diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 82ef87e9a897..42a55794004a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1263,6 +1263,15 @@ static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages) if (1 << order != nr_pages) goto out; + /* + * Some processors cannot always support single range for more than + * 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might + * also be affected, so for now rather than trying to keep track of + * which ones, just disable it for all. + */ + if (nr_pages > 1) + goto out; + buf->single = true; buf->nr_pages = nr_pages; ret = 0; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 2adeaf4de4df..e278e2e7c051 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -2,6 +2,7 @@ #include <linux/slab.h> #include <linux/pci.h> #include <asm/apicdef.h> +#include <asm/intel-family.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/perf_event.h> @@ -88,12 +89,12 @@ struct intel_uncore_type { * to identify which platform component each PMON block of that type is * supposed to monitor. */ - struct intel_uncore_topology *topology; + struct intel_uncore_topology **topology; /* * Optional callbacks for managing mapping of Uncore units to PMONs */ int (*get_topology)(struct intel_uncore_type *type); - int (*set_mapping)(struct intel_uncore_type *type); + void (*set_mapping)(struct intel_uncore_type *type); void (*cleanup_mapping)(struct intel_uncore_type *type); }; @@ -178,11 +179,26 @@ struct freerunning_counters { unsigned *box_offsets; }; -struct intel_uncore_topology { - u64 configuration; +struct uncore_iio_topology { + int pci_bus_no; int segment; }; +struct uncore_upi_topology { + int die_to; + int pmu_idx_to; + int enabled; +}; + +struct intel_uncore_topology { + int pmu_idx; + union { + void *untyped; + struct uncore_iio_topology *iio; + struct uncore_upi_topology *upi; + }; +}; + struct pci2phy_map { struct list_head list; int segment; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 1ef4f7861e2e..1f4869227efb 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1338,6 +1338,7 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box, /* MCHBAR is disabled */ if (!(mch_bar & BIT(0))) { pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n"); + pci_dev_put(pdev); return; } mch_bar &= ~BIT(0); @@ -1352,6 +1353,8 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box, box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + + pci_dev_put(pdev); } static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ed869443efb2..44c2f879f708 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -445,6 +445,7 @@ #define ICX_UPI_PCI_PMON_CTR0 0x320 #define ICX_UPI_PCI_PMON_BOX_CTL 0x318 #define ICX_UPI_CTL_UMASK_EXT 0xffffff +#define ICX_UBOX_DID 0x3450 /* ICX M3UPI*/ #define ICX_M3UPI_PCI_PMON_CTL0 0xd8 @@ -457,6 +458,7 @@ /* SPR */ #define SPR_RAW_EVENT_MASK_EXT 0xffffff +#define SPR_UBOX_DID 0x3250 /* SPR CHA */ #define SPR_CHA_PMON_CTL_TID_EN (1 << 16) @@ -1372,6 +1374,28 @@ static struct pci_driver snbep_uncore_pci_driver = { #define NODE_ID_MASK 0x7 +/* Each three bits from 0 to 23 of GIDNIDMAP register correspond Node ID. */ +#define GIDNIDMAP(config, id) (((config) >> (3 * (id))) & 0x7) + +static int upi_nodeid_groupid(struct pci_dev *ubox_dev, int nodeid_loc, int idmap_loc, + int *nodeid, int *groupid) +{ + int ret; + + /* get the Node ID of the local register */ + ret = pci_read_config_dword(ubox_dev, nodeid_loc, nodeid); + if (ret) + goto err; + + *nodeid = *nodeid & NODE_ID_MASK; + /* get the Node ID mapping */ + ret = pci_read_config_dword(ubox_dev, idmap_loc, groupid); + if (ret) + goto err; +err: + return ret; +} + /* * build pci bus to socket mapping */ @@ -1397,13 +1421,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool * the topology. */ if (nr_node_ids <= 8) { - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); - if (err) - break; - nodeid = config & NODE_ID_MASK; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, idmap_loc, &config); + err = upi_nodeid_groupid(ubox_dev, nodeid_loc, idmap_loc, + &nodeid, &config); if (err) break; @@ -1421,7 +1440,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool * to a particular node. */ for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { + if (nodeid == GIDNIDMAP(config, i)) { if (topology_max_die_per_package() > 1) die_id = i; else @@ -2891,6 +2910,7 @@ static bool hswep_has_limit_sbox(unsigned int device) return false; pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + pci_dev_put(dev); if (!hswep_get_chop(capid4)) return true; @@ -3699,10 +3719,16 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; -static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +static struct intel_uncore_topology *pmu_topology(struct intel_uncore_pmu *pmu, int die) { - return pmu->type->topology[die].configuration >> - (pmu->pmu_idx * BUS_NUM_STRIDE); + int idx; + + for (idx = 0; idx < pmu->type->num_boxes; idx++) { + if (pmu->type->topology[die][idx].pmu_idx == pmu->pmu_idx) + return &pmu->type->topology[die][idx]; + } + + return NULL; } static umode_t @@ -3710,8 +3736,9 @@ pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die, int zero_bus_pmu) { struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + struct intel_uncore_topology *pmut = pmu_topology(pmu, die); - return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode; + return (pmut && !pmut->iio->pci_bus_no && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode; } static umode_t @@ -3727,9 +3754,10 @@ static ssize_t skx_iio_mapping_show(struct device *dev, struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); long die = (long)ea->var; + struct intel_uncore_topology *pmut = pmu_topology(pmu, die); - return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment, - skx_iio_stack(pmu, die)); + return sprintf(buf, "%04x:%02x\n", pmut ? pmut->iio->segment : 0, + pmut ? pmut->iio->pci_bus_no : 0); } static int skx_msr_cpu_bus_read(int cpu, u64 *topology) @@ -3764,18 +3792,79 @@ static int die_to_cpu(int die) return res; } -static int skx_iio_get_topology(struct intel_uncore_type *type) +enum { + IIO_TOPOLOGY_TYPE, + UPI_TOPOLOGY_TYPE, + TOPOLOGY_MAX +}; + +static const size_t topology_size[TOPOLOGY_MAX] = { + sizeof(*((struct intel_uncore_topology *)NULL)->iio), + sizeof(*((struct intel_uncore_topology *)NULL)->upi) +}; + +static int pmu_alloc_topology(struct intel_uncore_type *type, int topology_type) { - int die, ret = -EPERM; + int die, idx; + struct intel_uncore_topology **topology; + + if (!type->num_boxes) + return -EPERM; - type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), - GFP_KERNEL); - if (!type->topology) - return -ENOMEM; + topology = kcalloc(uncore_max_dies(), sizeof(*topology), GFP_KERNEL); + if (!topology) + goto err; for (die = 0; die < uncore_max_dies(); die++) { - ret = skx_msr_cpu_bus_read(die_to_cpu(die), - &type->topology[die].configuration); + topology[die] = kcalloc(type->num_boxes, sizeof(**topology), GFP_KERNEL); + if (!topology[die]) + goto clear; + for (idx = 0; idx < type->num_boxes; idx++) { + topology[die][idx].untyped = kcalloc(type->num_boxes, + topology_size[topology_type], + GFP_KERNEL); + if (!topology[die][idx].untyped) + goto clear; + } + } + + type->topology = topology; + + return 0; +clear: + for (; die >= 0; die--) { + for (idx = 0; idx < type->num_boxes; idx++) + kfree(topology[die][idx].untyped); + kfree(topology[die]); + } + kfree(topology); +err: + return -ENOMEM; +} + +static void pmu_free_topology(struct intel_uncore_type *type) +{ + int die, idx; + + if (type->topology) { + for (die = 0; die < uncore_max_dies(); die++) { + for (idx = 0; idx < type->num_boxes; idx++) + kfree(type->topology[die][idx].untyped); + kfree(type->topology[die]); + } + kfree(type->topology); + type->topology = NULL; + } +} + +static int skx_pmu_get_topology(struct intel_uncore_type *type, + int (*topology_cb)(struct intel_uncore_type*, int, int, u64)) +{ + int die, ret = -EPERM; + u64 cpu_bus_msr; + + for (die = 0; die < uncore_max_dies(); die++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(die), &cpu_bus_msr); if (ret) break; @@ -3783,15 +3872,33 @@ static int skx_iio_get_topology(struct intel_uncore_type *type) if (ret < 0) break; - type->topology[die].segment = ret; + ret = topology_cb(type, ret, die, cpu_bus_msr); + if (ret) + break; } - if (ret < 0) { - kfree(type->topology); - type->topology = NULL; + return ret; +} + +static int skx_iio_topology_cb(struct intel_uncore_type *type, int segment, + int die, u64 cpu_bus_msr) +{ + int idx; + struct intel_uncore_topology *t; + + for (idx = 0; idx < type->num_boxes; idx++) { + t = &type->topology[die][idx]; + t->pmu_idx = idx; + t->iio->segment = segment; + t->iio->pci_bus_no = (cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff; } - return ret; + return 0; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + return skx_pmu_get_topology(type, skx_iio_topology_cb); } static struct attribute_group skx_iio_mapping_group = { @@ -3803,8 +3910,25 @@ static const struct attribute_group *skx_iio_attr_update[] = { NULL, }; -static int -pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +static void pmu_clear_mapping_attr(const struct attribute_group **groups, + struct attribute_group *ag) +{ + int i; + + for (i = 0; groups[i]; i++) { + if (groups[i] == ag) { + for (i++; groups[i]; i++) + groups[i - 1] = groups[i]; + groups[i - 1] = NULL; + break; + } + } +} + +static void +pmu_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag, + ssize_t (*show)(struct device*, struct device_attribute*, char*), + int topology_type) { char buf[64]; int ret; @@ -3812,11 +3936,13 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) struct attribute **attrs = NULL; struct dev_ext_attribute *eas = NULL; - ret = type->get_topology(type); + ret = pmu_alloc_topology(type, topology_type); if (ret < 0) goto clear_attr_update; - ret = -ENOMEM; + ret = type->get_topology(type); + if (ret < 0) + goto clear_topology; /* One more for NULL. */ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3828,20 +3954,20 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) goto clear_attrs; for (die = 0; die < uncore_max_dies(); die++) { - sprintf(buf, "die%ld", die); + snprintf(buf, sizeof(buf), "die%ld", die); sysfs_attr_init(&eas[die].attr.attr); eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); if (!eas[die].attr.attr.name) goto err; eas[die].attr.attr.mode = 0444; - eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.show = show; eas[die].attr.store = NULL; eas[die].var = (void *)die; attrs[die] = &eas[die].attr.attr; } ag->attrs = attrs; - return 0; + return; err: for (; die >= 0; die--) kfree(eas[die].attr.attr.name); @@ -3849,14 +3975,13 @@ err: clear_attrs: kfree(attrs); clear_topology: - kfree(type->topology); + pmu_free_topology(type); clear_attr_update: - type->attr_update = NULL; - return ret; + pmu_clear_mapping_attr(type->attr_update, ag); } static void -pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +pmu_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag) { struct attribute **attr = ag->attrs; @@ -3868,17 +3993,23 @@ pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group * kfree(attr_to_ext_attr(*ag->attrs)); kfree(ag->attrs); ag->attrs = NULL; - kfree(type->topology); + pmu_free_topology(type); +} + +static void +pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +{ + pmu_set_mapping(type, ag, skx_iio_mapping_show, IIO_TOPOLOGY_TYPE); } -static int skx_iio_set_mapping(struct intel_uncore_type *type) +static void skx_iio_set_mapping(struct intel_uncore_type *type) { - return pmu_iio_set_mapping(type, &skx_iio_mapping_group); + pmu_iio_set_mapping(type, &skx_iio_mapping_group); } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) { - pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group); + pmu_cleanup_mapping(type, &skx_iio_mapping_group); } static struct intel_uncore_type skx_uncore_iio = { @@ -4139,6 +4270,132 @@ static struct intel_uncore_ops skx_upi_uncore_pci_ops = { .read_counter = snbep_uncore_pci_read_counter, }; +static umode_t +skx_upi_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + return pmu->type->topology[die][pmu->pmu_idx].upi->enabled ? attr->mode : 0; +} + +static ssize_t skx_upi_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + struct uncore_upi_topology *upi = pmu->type->topology[die][pmu->pmu_idx].upi; + + return sysfs_emit(buf, "upi_%d,die_%d\n", upi->pmu_idx_to, upi->die_to); +} + +#define SKX_UPI_REG_DID 0x2058 +#define SKX_UPI_REGS_ADDR_DEVICE_LINK0 0x0e +#define SKX_UPI_REGS_ADDR_FUNCTION 0x00 + +/* + * UPI Link Parameter 0 + * | Bit | Default | Description + * | 19:16 | 0h | base_nodeid - The NodeID of the sending socket. + * | 12:8 | 00h | sending_port - The processor die port number of the sending port. + */ +#define SKX_KTILP0_OFFSET 0x94 + +/* + * UPI Pcode Status. This register is used by PCode to store the link training status. + * | Bit | Default | Description + * | 4 | 0h | ll_status_valid — Bit indicates the valid training status + * logged from PCode to the BIOS. + */ +#define SKX_KTIPCSTS_OFFSET 0x120 + +static int upi_fill_topology(struct pci_dev *dev, struct intel_uncore_topology *tp, + int pmu_idx) +{ + int ret; + u32 upi_conf; + struct uncore_upi_topology *upi = tp->upi; + + tp->pmu_idx = pmu_idx; + ret = pci_read_config_dword(dev, SKX_KTIPCSTS_OFFSET, &upi_conf); + if (ret) { + ret = pcibios_err_to_errno(ret); + goto err; + } + upi->enabled = (upi_conf >> 4) & 1; + if (upi->enabled) { + ret = pci_read_config_dword(dev, SKX_KTILP0_OFFSET, + &upi_conf); + if (ret) { + ret = pcibios_err_to_errno(ret); + goto err; + } + upi->die_to = (upi_conf >> 16) & 0xf; + upi->pmu_idx_to = (upi_conf >> 8) & 0x1f; + } +err: + return ret; +} + +static int skx_upi_topology_cb(struct intel_uncore_type *type, int segment, + int die, u64 cpu_bus_msr) +{ + int idx, ret; + struct intel_uncore_topology *upi; + unsigned int devfn; + struct pci_dev *dev = NULL; + u8 bus = cpu_bus_msr >> (3 * BUS_NUM_STRIDE); + + for (idx = 0; idx < type->num_boxes; idx++) { + upi = &type->topology[die][idx]; + devfn = PCI_DEVFN(SKX_UPI_REGS_ADDR_DEVICE_LINK0 + idx, + SKX_UPI_REGS_ADDR_FUNCTION); + dev = pci_get_domain_bus_and_slot(segment, bus, devfn); + if (dev) { + ret = upi_fill_topology(dev, upi, idx); + if (ret) + break; + } + } + + pci_dev_put(dev); + return ret; +} + +static int skx_upi_get_topology(struct intel_uncore_type *type) +{ + /* CPX case is not supported */ + if (boot_cpu_data.x86_stepping == 11) + return -EPERM; + + return skx_pmu_get_topology(type, skx_upi_topology_cb); +} + +static struct attribute_group skx_upi_mapping_group = { + .is_visible = skx_upi_mapping_visible, +}; + +static const struct attribute_group *skx_upi_attr_update[] = { + &skx_upi_mapping_group, + NULL +}; + +static void +pmu_upi_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +{ + pmu_set_mapping(type, ag, skx_upi_mapping_show, UPI_TOPOLOGY_TYPE); +} + +static void skx_upi_set_mapping(struct intel_uncore_type *type) +{ + pmu_upi_set_mapping(type, &skx_upi_mapping_group); +} + +static void skx_upi_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_cleanup_mapping(type, &skx_upi_mapping_group); +} + static struct intel_uncore_type skx_uncore_upi = { .name = "upi", .num_counters = 4, @@ -4151,6 +4408,10 @@ static struct intel_uncore_type skx_uncore_upi = { .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &skx_upi_uncore_format_group, + .attr_update = skx_upi_attr_update, + .get_topology = skx_upi_get_topology, + .set_mapping = skx_upi_set_mapping, + .cleanup_mapping = skx_upi_cleanup_mapping, }; static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box) @@ -4461,11 +4722,6 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map int die, stack_id, ret = -EPERM; struct pci_dev *dev = NULL; - type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), - GFP_KERNEL); - if (!type->topology) - return -ENOMEM; - while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) { ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg); if (ret) { @@ -4483,14 +4739,12 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map /* Convert stack id from SAD_CONTROL to PMON notation. */ stack_id = sad_pmon_mapping[stack_id]; - ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number; - type->topology[die].segment = pci_domain_nr(dev->bus); + type->topology[die][stack_id].iio->segment = pci_domain_nr(dev->bus); + type->topology[die][stack_id].pmu_idx = stack_id; + type->topology[die][stack_id].iio->pci_bus_no = dev->bus->number; } - if (ret) { - kfree(type->topology); - type->topology = NULL; - } + pci_dev_put(dev); return ret; } @@ -4519,14 +4773,14 @@ static int snr_iio_get_topology(struct intel_uncore_type *type) return sad_cfg_iio_topology(type, snr_sad_pmon_mapping); } -static int snr_iio_set_mapping(struct intel_uncore_type *type) +static void snr_iio_set_mapping(struct intel_uncore_type *type) { - return pmu_iio_set_mapping(type, &snr_iio_mapping_group); + pmu_iio_set_mapping(type, &snr_iio_mapping_group); } static void snr_iio_cleanup_mapping(struct intel_uncore_type *type) { - pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group); + pmu_cleanup_mapping(type, &snr_iio_mapping_group); } static struct event_constraint snr_uncore_iio_constraints[] = { @@ -4857,6 +5111,8 @@ static int snr_uncore_mmio_map(struct intel_uncore_box *box, addr += box_ctl; + pci_dev_put(pdev); + box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) { pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); @@ -5137,14 +5393,19 @@ static int icx_iio_get_topology(struct intel_uncore_type *type) return sad_cfg_iio_topology(type, icx_sad_pmon_mapping); } -static int icx_iio_set_mapping(struct intel_uncore_type *type) +static void icx_iio_set_mapping(struct intel_uncore_type *type) { - return pmu_iio_set_mapping(type, &icx_iio_mapping_group); + /* Detect ICX-D system. This case is not supported */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) { + pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group); + return; + } + pmu_iio_set_mapping(type, &icx_iio_mapping_group); } static void icx_iio_cleanup_mapping(struct intel_uncore_type *type) { - pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group); + pmu_cleanup_mapping(type, &icx_iio_mapping_group); } static struct intel_uncore_type icx_uncore_iio = { @@ -5337,6 +5598,76 @@ static const struct attribute_group icx_upi_uncore_format_group = { .attrs = icx_upi_uncore_formats_attr, }; +#define ICX_UPI_REGS_ADDR_DEVICE_LINK0 0x02 +#define ICX_UPI_REGS_ADDR_FUNCTION 0x01 + +static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, int dev_link0) +{ + struct pci_dev *ubox = NULL; + struct pci_dev *dev = NULL; + u32 nid, gid; + int i, idx, ret = -EPERM; + struct intel_uncore_topology *upi; + unsigned int devfn; + + /* GIDNIDMAP method supports machines which have less than 8 sockets. */ + if (uncore_max_dies() > 8) + goto err; + + while ((ubox = pci_get_device(PCI_VENDOR_ID_INTEL, ubox_did, ubox))) { + ret = upi_nodeid_groupid(ubox, SKX_CPUNODEID, SKX_GIDNIDMAP, &nid, &gid); + if (ret) { + ret = pcibios_err_to_errno(ret); + break; + } + + for (i = 0; i < 8; i++) { + if (nid != GIDNIDMAP(gid, i)) + continue; + for (idx = 0; idx < type->num_boxes; idx++) { + upi = &type->topology[nid][idx]; + devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); + dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), + ubox->bus->number, + devfn); + if (dev) { + ret = upi_fill_topology(dev, upi, idx); + if (ret) + goto err; + } + } + } + } +err: + pci_dev_put(ubox); + pci_dev_put(dev); + return ret; +} + +static int icx_upi_get_topology(struct intel_uncore_type *type) +{ + return discover_upi_topology(type, ICX_UBOX_DID, ICX_UPI_REGS_ADDR_DEVICE_LINK0); +} + +static struct attribute_group icx_upi_mapping_group = { + .is_visible = skx_upi_mapping_visible, +}; + +static const struct attribute_group *icx_upi_attr_update[] = { + &icx_upi_mapping_group, + NULL +}; + +static void icx_upi_set_mapping(struct intel_uncore_type *type) +{ + pmu_upi_set_mapping(type, &icx_upi_mapping_group); +} + +static void icx_upi_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_cleanup_mapping(type, &icx_upi_mapping_group); +} + static struct intel_uncore_type icx_uncore_upi = { .name = "upi", .num_counters = 4, @@ -5349,6 +5680,10 @@ static struct intel_uncore_type icx_uncore_upi = { .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &icx_upi_uncore_format_group, + .attr_update = icx_upi_attr_update, + .get_topology = icx_upi_get_topology, + .set_mapping = icx_upi_set_mapping, + .cleanup_mapping = icx_upi_cleanup_mapping, }; static struct event_constraint icx_uncore_m3upi_constraints[] = { @@ -5780,9 +6115,43 @@ static struct intel_uncore_type spr_uncore_m2m = { .name = "m2m", }; +static struct attribute_group spr_upi_mapping_group = { + .is_visible = skx_upi_mapping_visible, +}; + +static const struct attribute_group *spr_upi_attr_update[] = { + &uncore_alias_group, + &spr_upi_mapping_group, + NULL +}; + +#define SPR_UPI_REGS_ADDR_DEVICE_LINK0 0x01 + +static void spr_upi_set_mapping(struct intel_uncore_type *type) +{ + pmu_upi_set_mapping(type, &spr_upi_mapping_group); +} + +static void spr_upi_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_cleanup_mapping(type, &spr_upi_mapping_group); +} + +static int spr_upi_get_topology(struct intel_uncore_type *type) +{ + return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0); +} + static struct intel_uncore_type spr_uncore_upi = { - SPR_UNCORE_PCI_COMMON_FORMAT(), + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, + .format_group = &spr_uncore_raw_format_group, + .ops = &spr_uncore_pci_ops, .name = "upi", + .attr_update = spr_upi_attr_update, + .get_topology = spr_upi_get_topology, + .set_mapping = spr_upi_set_mapping, + .cleanup_mapping = spr_upi_cleanup_mapping, }; static struct intel_uncore_type spr_uncore_m3upi = { @@ -5986,6 +6355,12 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, to_type->format_group = from_type->format_group; if (from_type->attr_update) to_type->attr_update = from_type->attr_update; + if (from_type->set_mapping) + to_type->set_mapping = from_type->set_mapping; + if (from_type->get_topology) + to_type->get_topology = from_type->get_topology; + if (from_type->cleanup_mapping) + to_type->cleanup_mapping = from_type->cleanup_mapping; } static struct intel_uncore_type ** diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 332d2e6d8ae4..0e849f28a5c1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -811,7 +811,7 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*sched_task)(struct perf_event_context *ctx, + void (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* @@ -894,12 +894,12 @@ struct x86_pmu { int num_topdown_events; /* - * perf task context (i.e. struct perf_event_context::task_ctx_data) + * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. * See struct pmu::swap_task_ctx() usage for examples; */ - void (*swap_task_ctx)(struct perf_event_context *prev, - struct perf_event_context *next); + void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); /* * AMD bits @@ -925,7 +925,7 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); - int (*filter_match)(struct perf_event *event); + void (*filter)(struct pmu *pmu, int cpu, bool *ret); /* * Hybrid support * @@ -1180,8 +1180,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs); void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, u64 intel_ctrl); -void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu); - extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; @@ -1306,7 +1304,7 @@ void amd_pmu_lbr_reset(void); void amd_pmu_lbr_read(void); void amd_pmu_lbr_add(struct perf_event *event); void amd_pmu_lbr_del(struct perf_event *event); -void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); void amd_pmu_lbr_enable_all(void); void amd_pmu_lbr_disable_all(void); int amd_pmu_lbr_hw_config(struct perf_event *event); @@ -1322,7 +1320,6 @@ void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); -void amd_brs_disable_all(void); int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); @@ -1330,7 +1327,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - perf_sched_cb_inc(event->ctx->pmu); + perf_sched_cb_inc(event->pmu); cpuc->lbr_users++; /* * No need to reset BRS because it is reset @@ -1345,10 +1342,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); + perf_sched_cb_dec(event->pmu); } -void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); #else static inline int amd_brs_init(void) { @@ -1373,7 +1370,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event) { } -static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { } @@ -1533,7 +1530,7 @@ void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); @@ -1541,10 +1538,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, - struct perf_event_context *next); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, + struct perf_event_pmu_context *next_epc); -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index f49bc3ec76e6..41ef036ebb7b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -77,7 +77,7 @@ static int hyperv_init_ghcb(void) static int hv_cpu_init(unsigned int cpu) { union hv_vp_assist_msr_contents msr = { 0 }; - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; + struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu]; int ret; ret = hv_common_cpu_init(cpu); @@ -87,34 +87,32 @@ static int hv_cpu_init(unsigned int cpu) if (!hv_vp_assist_page) return 0; - if (!*hvp) { - if (hv_root_partition) { - /* - * For root partition we get the hypervisor provided VP assist - * page, instead of allocating a new page. - */ - rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); - *hvp = memremap(msr.pfn << - HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, - PAGE_SIZE, MEMREMAP_WB); - } else { - /* - * The VP assist page is an "overlay" page (see Hyper-V TLFS's - * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed - * out to make sure we always write the EOI MSR in - * hv_apic_eoi_write() *after* the EOI optimization is disabled - * in hv_cpu_die(), otherwise a CPU may not be stopped in the - * case of CPU offlining and the VM will hang. - */ + if (hv_root_partition) { + /* + * For root partition we get the hypervisor provided VP assist + * page, instead of allocating a new page. + */ + rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + } else { + /* + * The VP assist page is an "overlay" page (see Hyper-V TLFS's + * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed + * out to make sure we always write the EOI MSR in + * hv_apic_eoi_write() *after* the EOI optimization is disabled + * in hv_cpu_die(), otherwise a CPU may not be stopped in the + * case of CPU offlining and the VM will hang. + */ + if (!*hvp) *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); - if (*hvp) - msr.pfn = vmalloc_to_pfn(*hvp); - } - WARN_ON(!(*hvp)); - if (*hvp) { - msr.enable = 1; - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); - } + if (*hvp) + msr.pfn = vmalloc_to_pfn(*hvp); + + } + if (!WARN_ON(!(*hvp))) { + msr.enable = 1; + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } return hyperv_init_ghcb(); @@ -464,6 +462,8 @@ void __init hyperv_init(void) BUG_ON(!src); memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE); memunmap(src); + + hv_remap_tsc_clocksource(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); @@ -537,8 +537,6 @@ void hyperv_cleanup(void) union hv_x64_msr_hypercall_contents hypercall_msr; union hv_reference_tsc_msr tsc_msr; - unregister_syscore_ops(&hv_syscore_ops); - /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e481056698de..333556a86b2a 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,5 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o - audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3415321c8240..3216da7074ba 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -249,7 +249,6 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; extern void __init x2apic_set_max_apicid(u32 apicid); -extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { @@ -258,13 +257,13 @@ static inline int x2apic_enabled(void) #define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ -static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ +extern void __init check_x2apic(void); struct irq_data; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b71f4f2ecdd5..c9f4730bb113 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -304,6 +304,10 @@ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ + + +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index cb0ff1055ab1..be8b58da63b9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -152,10 +152,6 @@ do { \ (elf_check_arch_ia32(x) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) -#if __USER32_DS != __USER_DS -# error "The following code assumes __USER32_DS == __USER_DS" -#endif - static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index e1c9df9102a5..611fa41711af 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -13,16 +13,9 @@ #ifdef CONFIG_X86_64 # include <uapi/asm/sigcontext.h> # include <asm/user32.h> -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); #else # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame #endif extern void convert_from_fxsr(struct user_i387_ia32_struct *env, diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 908d99b127d3..5061ac98ffa1 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,19 +34,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } -/* - * When a ftrace registered caller is tracing a function that is - * also set by a register_ftrace_direct() call, it needs to be - * differentiated in the ftrace_caller trampoline. To do this, we - * place the direct caller in the ORIG_AX part of pt_regs. This - * tells the ftrace_caller that there's a direct caller. - */ -static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) -{ - /* Emulate a call */ - regs->orig_ax = addr; -} - #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_regs { struct pt_regs regs; @@ -61,9 +48,25 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return &fregs->regs; } -#define ftrace_instruction_pointer_set(fregs, _ip) \ +#define ftrace_regs_set_instruction_pointer(fregs, _ip) \ do { (fregs)->regs.ip = (_ip); } while (0) +#define ftrace_regs_get_instruction_pointer(fregs) \ + ((fregs)->regs.ip) + +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(&(fregs)->regs, n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(&(fregs)->regs) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(&(fregs)->regs) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(&(fregs)->regs, ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(&(fregs)->regs) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) + struct ftrace_ops; #define ftrace_graph_func ftrace_graph_func void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, @@ -72,6 +75,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When a ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, we + * place the direct caller in the ORIG_AX part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void +__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + /* Emulate a call */ + regs->orig_ax = addr; +} +#define arch_ftrace_set_direct_caller(fregs, addr) \ + __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 3089ec352743..6d9368ea3701 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -374,11 +374,20 @@ struct hv_nested_enlightenments_control { struct hv_vp_assist_page { __u32 apic_assist; __u32 reserved1; - __u64 vtl_control[3]; + __u32 vtl_entry_reason; + __u32 vtl_reserved; + __u64 vtl_ret_x64rax; + __u64 vtl_ret_x64rcx; struct hv_nested_enlightenments_control nested_control; __u8 enlighten_vmentry; __u8 reserved2[7]; __u64 current_nested_vmcs; + __u8 synthetic_time_unhalted_timer_expired; + __u8 reserved3[7]; + __u8 virtualization_fault_information[40]; + __u8 reserved4[8]; + __u8 intercept_message[256]; + __u8 vtl_ret_actions[256]; } __packed; struct hv_enlightened_vmcs { diff --git a/arch/x86/include/asm/hyperv_timer.h b/arch/x86/include/asm/hyperv_timer.h new file mode 100644 index 000000000000..388fa81b8f38 --- /dev/null +++ b/arch/x86/include/asm/hyperv_timer.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_HYPERV_TIMER_H +#define _ASM_X86_HYPERV_TIMER_H + +#include <asm/msr.h> + +#define hv_get_raw_timer() rdtsc_ordered() + +#endif diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7cc49432187f..7a2ed154a5e1 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,10 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ -extern struct irq_domain * -arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); - /* Get parent irqdomain for interrupt remapping irqdomain */ static inline struct irq_domain *arch_get_ir_parent_domain(void) { diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 125c23b7bad3..30c325c235c0 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -7,9 +7,7 @@ #ifdef CONFIG_X86_LOCAL_APIC enum { - /* Allocate contiguous CPU vectors */ - X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, - X86_IRQ_ALLOC_LEGACY = 0x2, + X86_IRQ_ALLOC_LEGACY = 0x1, }; extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 61f0c206bff0..6d502f3efb0f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -19,8 +19,6 @@ typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, void *data); -#define hv_get_raw_timer() rdtsc_ordered() - void hyperv_vector_handler(struct pt_regs *regs); #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index d71c7e8b738d..935c6d470341 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -62,4 +62,10 @@ typedef struct x86_msi_addr_hi { struct msi_msg; u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); +#define X86_VECTOR_MSI_FLAGS_SUPPORTED \ + (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN) + +#define X86_VECTOR_MSI_FLAGS_REQUIRED \ + (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS) + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 10ac52705892..6bcd5d2dd718 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -4,12 +4,7 @@ #include <linux/bits.h> -/* - * CPU model specific register (MSR) numbers. - * - * Do not add new entries to this file unless the definitions are shared - * between multiple compilation units. - */ +/* CPU model specific register (MSR) numbers. */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ @@ -535,6 +530,11 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -640,9 +640,6 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a @@ -1050,6 +1047,20 @@ #define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU +/* Resctrl MSRs: */ +/* - Intel: */ +#define MSR_IA32_L3_QOS_CFG 0xc81 +#define MSR_IA32_L2_QOS_CFG 0xc82 +#define MSR_IA32_QM_EVTSEL 0xc8d +#define MSR_IA32_QM_CTR 0xc8e +#define MSR_IA32_PQR_ASSOC 0xc8f +#define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_IA32_L2_CBM_BASE 0xd10 +#define MSR_IA32_MBA_THRTL_BASE 0xd50 + +/* - AMD: */ +#define MSR_IA32_MBA_BW_BASE 0xc0000200 + /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c936ce9f0c47..dfdb103ae4f6 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -321,7 +321,7 @@ static inline void indirect_branch_prediction_barrier(void) /* The Intel SPEC CTRL MSR base value cache */ extern u64 x86_spec_ctrl_base; DECLARE_PER_CPU(u64, x86_spec_ctrl_current); -extern void write_spec_ctrl_current(u64 val, bool force); +extern void update_spec_ctrl_cond(u64 val); extern u64 spec_ctrl_current(void); /* diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 736793d65bcb..b40c462b4af3 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -21,7 +21,7 @@ struct pci_sysdata { #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ #endif -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +#ifdef CONFIG_PCI_MSI void *fwnode; /* IRQ domain for MSI assignment */ #endif #if IS_ENABLED(CONFIG_VMD) @@ -52,7 +52,7 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +#ifdef CONFIG_PCI_MSI static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) { return to_pci_sysdata(bus)->fwnode; @@ -92,6 +92,7 @@ void pcibios_scan_root(int bus); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev); #define HAVE_PCI_MMAP #define arch_can_pci_mmap_wc() pat_enabled() diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5059799bebe3..286a71810f9e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -139,6 +139,7 @@ static inline int pmd_dirty(pmd_t pmd) return pmd_flags(pmd) & _PAGE_DIRTY; } +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; @@ -1438,6 +1439,14 @@ static inline bool arch_has_hw_pte_young(void) return true; } +#ifdef CONFIG_XEN_PV +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young +static inline bool arch_has_hw_nonleaf_pmd_young(void) +{ + return !cpu_feature_enabled(X86_FEATURE_XENPV); +} +#endif + #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) { diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 60ece592b220..dbb38a6b4dfb 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -37,7 +37,7 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .spinlock.text;" +asm (".pushsection .spinlock.text, \"ax\";" ".globl " PV_UNLOCK ";" ".type " PV_UNLOCK ", @function;" ".align 4,0x90;" diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index d24b04ebf950..52788f79786f 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,8 +7,6 @@ #include <linux/sched.h> #include <linux/jump_label.h> -#define IA32_PQR_ASSOC 0x0c8f - /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID @@ -16,8 +14,8 @@ * @default_rmid: The user assigned Resource Monitoring ID * @default_closid: The user assigned cached Class Of Service ID * - * The upper 32 bits of IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always * contains both parts, so we need to cache them. This also * stores the user configured per cpu CLOSID and RMID. * @@ -77,7 +75,7 @@ static void __resctrl_sched_in(void) if (closid != state->cur_closid || rmid != state->cur_rmid) { state->cur_closid = closid; state->cur_rmid = rmid; - wrmsr(IA32_PQR_ASSOC, rmid, closid); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid); } } diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 2e7890dd58a4..c390a672d560 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -135,6 +135,7 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ @@ -210,7 +211,6 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) -#define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index eae20fa52b93..6a0069761508 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -115,17 +115,36 @@ enum sgx_miscselect { * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to * sign cryptographic tokens that can be passed to * EINIT as an authorization to run an enclave. + * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an + * asynchronous exit has occurred. */ enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + /* BIT(3) is reserved */ + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + /* BIT(6) is for CET */ + SGX_ATTR_KSS = BIT(7), + /* BIT(8) is reserved */ + /* BIT(9) is reserved */ + SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10), }; -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \ + BIT_ULL(6) | \ + BIT_ULL(8) | \ + BIT_ULL(9) | \ + GENMASK_ULL(63, 11)) + +#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \ + SGX_ATTR_MODE64BIT | \ + SGX_ATTR_KSS | \ + SGX_ATTR_ASYNC_EXIT_NOTIFY) + +#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \ + SGX_ATTR_EINITTOKENKEY) /** * struct sgx_secs - SGX Enclave Control Structure (SECS) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 65e667279e0f..e770c4fc47f4 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -15,4 +15,13 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, + void __user **fpstate); + +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs); +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2dfb5fea13af..4a4043ca6493 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -28,11 +28,6 @@ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u -#ifndef CONFIG_COMPAT -#define compat_sigset_t compat_sigset_t -typedef sigset_t compat_sigset_t; -#endif - #endif /* __ASSEMBLY__ */ #include <uapi/asm/signal.h> #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 020c81a7c729..28d889c9aa16 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -67,6 +67,8 @@ void tdx_safe_halt(void); bool tdx_early_handle_ve(struct pt_regs *regs); +int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f901658d9f7c..cceaafdd2d84 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,7 +44,7 @@ obj-y += head_$(BITS).o obj-y += head$(BITS).o obj-y += ebda.o obj-y += platform-quirks.o -obj-y += process_$(BITS).o signal.o +obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o @@ -54,7 +54,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_ia32.o -obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5cadcea035e0..a9bea860e22a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -624,7 +624,7 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) #else -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1608,7 +1608,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, default: BUG_ON(len != insn.length); - }; + } switch (tp->opcode) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c6876d3ea4b1..20d9a604da7c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1931,16 +1931,19 @@ void __init check_x2apic(void) } } #else /* CONFIG_X86_X2APIC */ -static int __init validate_x2apic(void) +void __init check_x2apic(void) { if (!apic_is_x2apic_enabled()) - return 0; + return; /* - * Checkme: Can we simply turn off x2apic here instead of panic? + * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC? */ - panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n"); + pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); + pr_err("Disabling APIC, expect reduced performance and functionality.\n"); + + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); } -early_initcall(validate_x2apic); static inline void try_to_enable_x2apic(int remap_mode) { } static inline void __x2apic_enable(void) { } diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7517eb05bdc1..35d5b8fb18ef 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -142,70 +142,139 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) return ret; } -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. +/** + * pci_dev_has_default_msi_parent_domain - Check whether the device has the default + * MSI parent domain associated + * @dev: Pointer to the PCI device */ -static struct irq_chip pci_msi_controller = { - .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_AFFINITY_PRE_STARTUP, -}; +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev) +{ + struct irq_domain *domain = dev_get_msi_domain(&dev->dev); -int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, - msi_alloc_info_t *arg) + if (!domain) + domain = dev_get_msi_domain(&dev->bus->dev); + if (!domain) + return false; + + return domain == x86_vector_domain; +} + +/** + * x86_msi_prepare - Setup of msi_alloc_info_t for allocations + * @domain: The domain for which this setup happens + * @dev: The device for which interrupts are allocated + * @nvec: The number of vectors to allocate + * @alloc: The allocation info structure to initialize + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. It is always invoked from the + * top level interrupt domain. The domain specific allocation + * functionality is determined via the @domain's bus token which allows to + * map the X86 specific allocation type. + */ +static int x86_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *alloc) { - init_irq_alloc_info(arg, NULL); - if (to_pci_dev(dev)->msix_enabled) { - arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; - } else { - arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; - arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + struct msi_domain_info *info = domain->host_data; + + init_irq_alloc_info(alloc, NULL); + + switch (info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; + case DOMAIN_BUS_PCI_DEVICE_MSIX: + case DOMAIN_BUS_PCI_DEVICE_IMS: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + return 0; + default: + return -EINVAL; } - - return 0; } -EXPORT_SYMBOL_GPL(pci_msi_prepare); -static struct msi_domain_ops pci_msi_domain_ops = { - .msi_prepare = pci_msi_prepare, -}; +/** + * x86_init_dev_msi_info - Domain info setup for MSI domains + * @dev: The device for which the domain should be created + * @domain: The (root) domain providing this callback + * @real_parent: The real parent domain of the to initialize domain + * @info: The domain info for the to initialize domain + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. The domain specific functionality + * is determined via the @real_parent. + */ +static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + const struct msi_parent_ops *pops = real_parent->msi_parent_ops; + + /* MSI parent domain specific settings */ + switch (real_parent->bus_token) { + case DOMAIN_BUS_ANY: + /* Only the vector domain can have the ANY token */ + if (WARN_ON_ONCE(domain != real_parent)) + return false; + info->chip->irq_set_affinity = msi_set_affinity; + /* See msi_set_affinity() for the gory details */ + info->flags |= MSI_FLAG_NOMASK_QUIRK; + break; + case DOMAIN_BUS_DMAR: + case DOMAIN_BUS_AMDVI: + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* Is the target supported? */ + switch(info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: + break; + case DOMAIN_BUS_PCI_DEVICE_IMS: + if (!(pops->supported_flags & MSI_FLAG_PCI_IMS)) + return false; + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* + * Mask out the domain specific MSI feature flags which are not + * supported by the real parent. + */ + info->flags &= pops->supported_flags; + /* Enforce the required flags */ + info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED; + + /* This is always invoked from the top level MSI domain! */ + info->ops->msi_prepare = x86_msi_prepare; + + info->chip->irq_ack = irq_chip_ack_parent; + info->chip->irq_retrigger = irq_chip_retrigger_hierarchy; + info->chip->flags |= IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP; -static struct msi_domain_info pci_msi_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &pci_msi_controller, - .handler = handle_edge_irq, - .handler_name = "edge", + info->handler = handle_edge_irq; + info->handler_name = "edge"; + + return true; +} + +static const struct msi_parent_ops x86_vector_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED, + .init_dev_msi_info = x86_init_dev_msi_info, }; struct irq_domain * __init native_create_pci_msi_domain(void) { - struct fwnode_handle *fn; - struct irq_domain *d; - if (disable_apic) return NULL; - fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (!fn) - return NULL; - - d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - x86_vector_domain); - if (!d) { - irq_domain_free_fwnode(fn); - pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); - } else { - d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; - } - return d; + x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops; + return x86_vector_domain; } void __init x86_create_pci_msi_domain(void) @@ -213,41 +282,19 @@ void __init x86_create_pci_msi_domain(void) x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } -#ifdef CONFIG_IRQ_REMAP -static struct irq_chip pci_msi_ir_controller = { - .name = "IR-PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_AFFINITY_PRE_STARTUP, -}; - -static struct msi_domain_info pci_msi_ir_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &pci_msi_ir_controller, - .handler = handle_edge_irq, - .handler_name = "edge", -}; - -struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, - const char *name, int id) +/* Keep around for hyperV */ +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) { - struct fwnode_handle *fn; - struct irq_domain *d; + init_irq_alloc_info(arg, NULL); - fn = irq_domain_alloc_named_id_fwnode(name, id); - if (!fn) - return NULL; - d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - if (!d) - irq_domain_free_fwnode(fn); - return d; + if (to_pci_dev(dev)->msix_enabled) + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + else + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; } -#endif +EXPORT_SYMBOL_GPL(pci_msi_prepare); #ifdef CONFIG_DMAR_TABLE /* diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3e6f6b448f6a..c1efebd27e6c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -539,10 +539,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if (disable_apic) return -ENXIO; - /* Currently vector allocator can't guarantee contiguous allocations */ - if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) - return -ENOSYS; - /* * Catch any attempt to touch the cascade interrupt on a PIC * equipped system. diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 860b60273df3..c75d75b9f11a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -770,8 +770,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } -#define MSR_AMD64_DE_CFG 0xC0011029 - static void init_amd_ln(struct cpuinfo_x86 *c) { /* @@ -965,8 +963,8 @@ static void init_amd(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3e3230cccaa7..6daf84229548 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -60,11 +60,18 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); static DEFINE_MUTEX(spec_ctrl_mutex); +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + /* * Keep track of the SPEC_CTRL MSR value for the current task, which may differ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -void write_spec_ctrl_current(u64 val, bool force) +void update_spec_ctrl_cond(u64 val) { if (this_cpu_read(x86_spec_ctrl_current) == val) return; @@ -75,7 +82,7 @@ void write_spec_ctrl_current(u64 val, bool force) * When KERNEL_IBRS this MSR is written on return-to-user, unless * forced the update can be delayed until that time. */ - if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) wrmsrl(MSR_IA32_SPEC_CTRL, val); } @@ -1328,7 +1335,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) if (ia32_cap & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -1450,7 +1457,7 @@ static void __init spectre_v2_select_mitigation(void) if (spectre_v2_in_ibrs_mode(mode)) { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } switch (mode) { @@ -1564,7 +1571,7 @@ static void __init spectre_v2_select_mitigation(void) static void update_stibp_msr(void * __unused) { u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); - write_spec_ctrl_current(val, true); + update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -1797,7 +1804,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); } } @@ -2048,7 +2055,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - write_spec_ctrl_current(x86_spec_ctrl_base, true); + update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index c881bcafba7d..d95221117129 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 21fd425088fe..c393b8773ace 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -326,8 +326,8 @@ static void init_hygon(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2d7ea5480ec3..427899650483 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1034,8 +1034,32 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem); +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1146,12 +1170,20 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } -static void __split_lock_reenable(struct work_struct *work) +static void __split_lock_reenable_unlock(struct work_struct *work) { sld_update_msr(true); up(&buslock_sem); } +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + /* * If a CPU goes offline with pending delayed work to re-enable split lock * detection then the delayed work will be executed on some other CPU. That @@ -1169,10 +1201,9 @@ static int splitlock_cpu_offline(unsigned int cpu) return 0; } -static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); - static void split_lock_warn(unsigned long ip) { + struct delayed_work *work; int cpu; if (!current->reported_split_lock) @@ -1180,14 +1211,26 @@ static void split_lock_warn(unsigned long ip) current->comm, current->pid, ip); current->reported_split_lock = 1; - /* misery factor #1, sleep 10ms before trying to execute split lock */ - if (msleep_interruptible(10) > 0) - return; - /* Misery factor #2, only allow one buslocked disabled core at a time */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + cpu = get_cpu(); - schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1c87501e0fa3..10fb5b5c9efa 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -788,6 +788,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) return status & MCI_STATUS_DEFERRED; } +static bool _log_error_deferred(unsigned int bank, u32 misc) +{ + if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), misc)) + return false; + + /* + * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. + * Return true here to avoid accessing these registers. + */ + if (!mce_flags.smca) + return true; + + /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return true; +} + /* * We have three scenarios for checking for Deferred errors: * @@ -799,19 +817,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) */ static void log_error_deferred(unsigned int bank) { - bool defrd; - - defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), - mca_msr_reg(bank, MCA_ADDR), 0); - - if (!mce_flags.smca) - return; - - /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ - if (defrd) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + if (_log_error_deferred(bank, 0)) return; - } /* * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check @@ -832,7 +839,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); + _log_error_deferred(bank, misc); } static void log_and_reset_block(struct threshold_block *block) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 00483d1c27e4..c4477162c07d 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -203,6 +203,11 @@ static struct severity { BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) ), MCESEV( + PANIC, "Uncorrected in kernel", + BITSET(MCI_STATUS_UC), + KERNEL + ), + MCESEV( UC, "Uncorrected", BITSET(MCI_STATUS_UC) ), @@ -391,9 +396,6 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char *msg = s->msg; s->covered = 1; - if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; - return s->sev; } } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 831613959a92..46668e255421 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -475,6 +475,12 @@ static bool __init ms_hyperv_x2apic_available(void) * (logically) generates MSIs directly to the system APIC irq domain. * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the * pci-hyperv host bridge. + * + * Note: for a Hyper-V root partition, this will always return false. + * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by + * default, they are implemented as intercepts by the Windows Hyper-V stack. + * Even a nested root partition (L2 root) will not get them because the + * nested (L1) hypervisor filters them out. */ static bool __init ms_hyperv_msi_ext_dest_id(void) { diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3266ea36667c..c98e52ff5f20 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -575,7 +575,7 @@ static void clear_closid_rmid(int cpu) state->default_rmid = 0; state->cur_closid = 0; state->cur_rmid = 0; - wrmsr(IA32_PQR_ASSOC, 0, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); } static int resctrl_online_cpu(unsigned int cpu) @@ -828,7 +828,6 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = false; - r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { @@ -849,7 +848,6 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { r->cache.arch_has_sparse_bitmaps = true; - r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1dafbdc5ac31..1df0e3262bca 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -105,8 +105,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - if ((!r->cache.arch_has_empty_bitmaps && val == 0) || - val > r->default_ctrl) { + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5f7128686cfd..5ebd28e6aa0c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -8,16 +8,6 @@ #include <linux/fs_context.h> #include <linux/jump_label.h> -#define MSR_IA32_L3_QOS_CFG 0xc81 -#define MSR_IA32_L2_QOS_CFG 0xc82 -#define MSR_IA32_L3_CBM_BASE 0xc90 -#define MSR_IA32_L2_CBM_BASE 0xd10 -#define MSR_IA32_MBA_THRTL_BASE 0xd50 -#define MSR_IA32_MBA_BW_BASE 0xc0000200 - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d961ae3ed96e..ba8d0763b36b 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -477,7 +477,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); /* * Cache was flushed earlier. Now access kernel memory to read it * into cache region associated with just activated plr->closid. @@ -513,7 +513,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. */ - __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fc01f81f6e2a..f53944fb8f7f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 1ec20807de1e..68f8b18d2278 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -160,8 +160,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, return ret; pginfo.addr = encl_page->desc & PAGE_MASK; - pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pcmd_page = kmap_atomic(b.pcmd); + pginfo.contents = (unsigned long)kmap_local_page(b.contents); + pcmd_page = kmap_local_page(b.pcmd); pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) @@ -187,8 +187,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, */ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); - kunmap_atomic(pcmd_page); - kunmap_atomic((void *)(unsigned long)pginfo.contents); + kunmap_local(pcmd_page); + kunmap_local((void *)(unsigned long)pginfo.contents); get_page(b.pcmd); sgx_encl_put_backing(&b); @@ -197,10 +197,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); - pcmd_page = kmap_atomic(b.pcmd); + pcmd_page = kmap_local_page(b.pcmd); if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) pr_warn("PCMD page not empty after truncate.\n"); - kunmap_atomic(pcmd_page); + kunmap_local(pcmd_page); } put_page(b.pcmd); @@ -680,11 +680,15 @@ const struct vm_operations_struct sgx_vm_ops = { void sgx_encl_release(struct kref *ref) { struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1); struct sgx_va_page *va_page; struct sgx_encl_page *entry; - unsigned long index; + unsigned long count = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base)); - xa_for_each(&encl->page_array, index, entry) { + xas_lock(&xas); + xas_for_each(&xas, entry, max_page_index) { if (entry->epc_page) { /* * The page and its radix tree entry cannot be freed @@ -699,9 +703,20 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); - /* Invoke scheduler to prevent soft lockups. */ - cond_resched(); + /* + * Invoke scheduler on every XA_CHECK_SCHED iteration + * to prevent soft lockups. + */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + + cond_resched(); + + xas_lock(&xas); + } } + xas_unlock(&xas); xa_destroy(&encl->page_array); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index ebe79d60619f..21ca0a831b70 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -111,7 +111,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; - encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + encl->attributes_mask = SGX_ATTR_UNPRIV_MASK; /* Set only after completion, as encl->lock has not been taken. */ set_bit(SGX_ENCL_CREATED, &encl->flags); @@ -221,11 +221,11 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.metadata = (unsigned long)secinfo; - pginfo.contents = (unsigned long)kmap_atomic(src_page); + pginfo.contents = (unsigned long)kmap_local_page(src_page); ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); - kunmap_atomic((void *)pginfo.contents); + kunmap_local((void *)pginfo.contents); put_page(src_page); return ret ? -EIO : 0; @@ -356,6 +356,9 @@ static int sgx_validate_offset_length(struct sgx_encl *encl, if (!length || !IS_ALIGNED(length, PAGE_SIZE)) return -EINVAL; + if (offset + length < offset) + return -EINVAL; + if (offset + length - PAGE_SIZE >= encl->size) return -EINVAL; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 0aad028f04d4..e5a37b6e9aa5 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -165,17 +165,17 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, pginfo.addr = 0; pginfo.secs = 0; - pginfo.contents = (unsigned long)kmap_atomic(backing->contents); - pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + pginfo.contents = (unsigned long)kmap_local_page(backing->contents); + pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) + backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); set_page_dirty(backing->pcmd); set_page_dirty(backing->contents); - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + kunmap_local((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); - kunmap_atomic((void *)(unsigned long)pginfo.contents); + kunmap_local((void *)(unsigned long)pginfo.contents); return ret; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index ec7bbac3a9f2..8009c8346d8f 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -58,24 +58,6 @@ static void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool tsx_ctrl_is_supported(void) -{ - u64 ia32_cap = x86_read_arch_cap_msr(); - - /* - * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this - * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. - * - * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a - * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES - * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get - * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, - * tsx= cmdline requests will do nothing on CPUs without - * MSR_IA32_TSX_CTRL support. - */ - return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); -} - static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) { if (boot_cpu_has_bug(X86_BUG_TAA)) @@ -135,7 +117,7 @@ static void tsx_clear_cpuid(void) rdmsrl(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; wrmsrl(MSR_TSX_FORCE_ABORT, msr); - } else if (tsx_ctrl_is_supported()) { + } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { rdmsrl(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; wrmsrl(MSR_IA32_TSX_CTRL, msr); @@ -158,7 +140,8 @@ static void tsx_dev_mode_disable(void) u64 mcu_opt_ctrl; /* Check if RTM_ALLOW exists */ - if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() || + if (!boot_cpu_has_bug(X86_BUG_TAA) || + !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) || !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; @@ -191,7 +174,20 @@ void __init tsx_init(void) return; } - if (!tsx_ctrl_is_supported()) { + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) { + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + } else { tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; return; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 5cd51f25f446..28da5dd83fc0 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - BUG(); -} - void __init add_dtb(u64 data) { initial_dtb = data + offsetof(struct setup_data, data); @@ -167,7 +162,14 @@ static void __init dtb_lapic_setup(void) return; } smp_found_config = 1; - pic_mode = 1; + if (of_property_read_bool(dn, "intel,virtual-wire-mode")) { + pr_info("Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } else { + pr_info("IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } + register_lapic_address(lapic_addr); } @@ -248,7 +250,7 @@ static void __init dtb_add_ioapic(struct device_node *dn) ret = of_address_to_resource(dn, 0, &r); if (ret) { - printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn); + pr_err("Can't obtain address from device node %pOF.\n", dn); return; } mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); @@ -265,7 +267,7 @@ static void __init dtb_ioapic_setup(void) of_ioapic = 1; return; } - printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); + pr_err("Error: No information about IO-APIC in OF.\n"); } #else static void __init dtb_ioapic_setup(void) {} diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9417d5aa7305..16f9814c9be0 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -94,17 +94,7 @@ static inline unsigned long espfix_base_addr(unsigned int cpu) static void init_espfix_random(void) { - unsigned long rand; - - /* - * This is run before the entropy pools are initialized, - * but this is hopefully better than nothing. - */ - if (!arch_get_random_longs(&rand, 1)) { - /* The constant is an arbitrary large prime */ - rand = rdtsc(); - rand *= 0xc345c6b72fd16123UL; - } + unsigned long rand = get_random_long(); slot_random = rand % ESPFIX_STACKS_PER_PAGE; page_random = (rand / ESPFIX_STACKS_PER_PAGE) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3b28c5b25e12..9baa89a8877d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; - struct pkru_state *xpkru; - int ret; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) @@ -406,16 +404,15 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); - if (ret) - return ret; + /* + * Nullify @vpkru to preserve its current value if PKRU's bit isn't set + * in the header. KVM's odd ABI is to leave PKRU untouched in this + * case (all other components are eventually re-initialized). + */ + if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) + vpkru = NULL; - /* Retrieve PKRU if not in init state */ - if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { - xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); - *vpkru = xpkru->pkru; - } - return 0; + return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ @@ -605,9 +602,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); + fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); - fpregs_unlock(); /* * Children never inherit PASID state. diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 8946f89761cc..851eb13edc01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -133,9 +133,6 @@ static void __init fpu__init_system_generic(void) fpu__init_system_mxcsr(); } -/* Get alignment of the TYPE. */ -#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) - /* * Enforce that 'MEMBER' is the last field of 'TYPE'. * @@ -143,8 +140,8 @@ static void __init fpu__init_system_generic(void) * because that's how C aligns structs. */ #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ - TYPE_ALIGN(TYPE))) + BUILD_BUG_ON(sizeof(TYPE) != \ + ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE))) /* * We append the 'struct fpu' to the task_struct: diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 75ffaef8c299..6d056b68f4ed 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 91d4b6de58ab..558076dbde5b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + if (copy_sigframe_from_user_to_xstate(tsk, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 59e543b95a3c..714166cc25f2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void) } } -#define XSTATE_WARN_ON(x) do { \ - if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ +#define XSTATE_WARN_ON(x, fmt, ...) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) @@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr) (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { - WARN_ONCE(1, "no structure for xstate: %d\n", nr); - XSTATE_WARN_ON(1); + XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; @@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * XSAVES. */ if (!xsaves && xfeature_is_supervisor(i)) { - XSTATE_WARN_ON(1); + XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); - XSTATE_WARN_ON(size != kernel_size); + XSTATE_WARN_ON(size != kernel_size, + "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } @@ -1200,8 +1200,36 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } +/** + * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate + * @fpstate: The fpstate buffer to copy to + * @kbuf: The UABI format buffer, if it comes from the kernel + * @ubuf: The UABI format buffer, if it comes from userspace + * @pkru: The location to write the PKRU value to + * + * Converts from the UABI format into the kernel internal hardware + * dependent format. + * + * This function ultimately has three different callers with distinct PKRU + * behavior. + * 1. When called from sigreturn the PKRU register will be restored from + * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to + * @fpstate is sufficient to cover this case, but the caller will also + * pass a pointer to the thread_struct's pkru field in @pkru and updating + * it is harmless. + * 2. When called from ptrace the PKRU register will be restored from the + * thread_struct's pkru field. A pointer to that is passed in @pkru. + * The kernel will restore it manually, so the XRSTOR behavior that resets + * the PKRU register to the hardware init value (0) if the corresponding + * xfeatures bit is not set is emulated here. + * 3. When called from KVM the PKRU register will be restored from the vcpu's + * pkru field. A pointer to that is passed in @pkru. KVM hasn't used + * XRSTOR and hasn't had the PKRU resetting behavior described above. To + * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures + * bit is not set. + */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, - const void __user *ubuf) + const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; @@ -1250,6 +1278,20 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } } + if (hdr.xfeatures & XFEATURE_MASK_PKRU) { + struct pkru_state *xpkru; + + xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); + *pkru = xpkru->pkru; + } else { + /* + * KVM may pass NULL here to indicate that it does not need + * PKRU updated. + */ + if (pkru) + *pkru = 0; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1268,9 +1310,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { - return copy_uabi_to_xstate(fpstate, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* @@ -1278,10 +1320,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, +int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(fpstate, NULL, ubuf); + return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 5ad47031383b..a4ecb04d8d64 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); -extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); -extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); +extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); extern void fpu__init_cpu_xstate(void); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9b7acc9c7874..67c8ed99144b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -261,16 +261,6 @@ SYM_FUNC_START(startup_32_smp) addl $__PAGE_OFFSET, %esp /* - * start system 32-bit setup. We need to re-do some of the things done - * in 16-bit mode for the "real" operations. - */ - movl setup_once_ref,%eax - andl %eax,%eax - jz 1f # Did we do this already? - call *%eax -1: - -/* * Check if it is 486 */ movb $4,X86 # at least 486 @@ -331,18 +321,7 @@ SYM_FUNC_END(startup_32_smp) #include "verify_cpu.S" -/* - * setup_once - * - * The setup work we only want to run on the BSP. - * - * Warning: %esi is live across this function. - */ __INIT -setup_once: - andl $0,setup_once_ref /* Once is enough, thanks */ - RET - SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs @@ -458,7 +437,6 @@ SYM_DATA(early_recursion_flag, .long 0) __REFDATA .align 4 SYM_DATA(initial_code, .long i386_start_kernel) -SYM_DATA(setup_once_ref, .long setup_once) #ifdef CONFIG_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 15aefa3f3e18..3aa5304200c5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -407,7 +407,7 @@ struct legacy_pic null_legacy_pic = { .make_irq = legacy_pic_uint_noop, }; -struct legacy_pic default_legacy_pic = { +static struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, .mask = mask_8259A_irq, diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a98687642dd0..d85a6980e263 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -251,14 +251,12 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) @@ -302,12 +300,13 @@ int module_finalize(const Elf_Ehdr *hdr, void *iseg = (void *)ibt_endbr->sh_addr; apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); } - if (locks && text) { + if (locks) { void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; + void *text = me->core_layout.base; + void *text_end = text + me->core_layout.text_size; alternatives_smp_module_add(me, me->name, lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); + text, text_end); } if (orc && orc_ip) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 62671ccf0404..40d156a31676 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - write_spec_ctrl_current(msr, false); + update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 37c12fb92906..dfaa270a7cc9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -44,16 +44,35 @@ #include "tls.h" -enum x86_regset { - REGSET_GENERAL, - REGSET_FP, - REGSET_XFP, - REGSET_IOPERM64 = REGSET_XFP, - REGSET_XSTATE, - REGSET_TLS, - REGSET_IOPERM32, +enum x86_regset_32 { + REGSET32_GENERAL, + REGSET32_FP, + REGSET32_XFP, + REGSET32_XSTATE, + REGSET32_TLS, + REGSET32_IOPERM, }; +enum x86_regset_64 { + REGSET64_GENERAL, + REGSET64_FP, + REGSET64_IOPERM, + REGSET64_XSTATE, +}; + +#define REGSET_GENERAL \ +({ \ + BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ + REGSET32_GENERAL; \ +}) + +#define REGSET_FP \ +({ \ + BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ + REGSET32_FP; \ +}) + + struct pt_regs_offset { const char *name; int offset; @@ -788,13 +807,13 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, + REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, + REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif @@ -1086,13 +1105,13 @@ static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, 0, + REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, 0, + REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); @@ -1215,29 +1234,38 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = sizeof(struct user_regs_struct) / sizeof(long), - .size = sizeof(long), .align = sizeof(long), - .regset_get = genregs_get, .set = genregs_set + [REGSET64_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = genregs_get, + .set = genregs_set }, - [REGSET_FP] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct fxregs_state) / sizeof(long), - .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + [REGSET64_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct fxregs_state) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set }, - [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, - .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .regset_get = xstateregs_get, - .set = xstateregs_set + [REGSET64_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set }, - [REGSET_IOPERM64] = { - .core_note_type = NT_386_IOPERM, - .n = IO_BITMAP_LONGS, - .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .regset_get = ioperm_get + [REGSET64_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), + .align = sizeof(long), + .active = ioperm_active, + .regset_get = ioperm_get }, }; @@ -1256,43 +1284,57 @@ static const struct user_regset_view user_x86_64_view = { #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { - [REGSET_GENERAL] = { - .core_note_type = NT_PRSTATUS, - .n = sizeof(struct user_regs_struct32) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .regset_get = genregs32_get, .set = genregs32_set + [REGSET32_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .regset_get = genregs32_get, + .set = genregs32_set }, - [REGSET_FP] = { - .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set + [REGSET32_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_fpregs_active, + .regset_get = fpregs_get, + .set = fpregs_set }, - [REGSET_XFP] = { - .core_note_type = NT_PRXFPREG, - .n = sizeof(struct fxregs_state) / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + [REGSET32_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct fxregs_state) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set }, - [REGSET_XSTATE] = { - .core_note_type = NT_X86_XSTATE, - .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .regset_get = xstateregs_get, - .set = xstateregs_set + [REGSET32_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set }, - [REGSET_TLS] = { - .core_note_type = NT_386_TLS, - .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, - .size = sizeof(struct user_desc), - .align = sizeof(struct user_desc), - .active = regset_tls_active, - .regset_get = regset_tls_get, .set = regset_tls_set + [REGSET32_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, + .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .regset_get = regset_tls_get, + .set = regset_tls_set }, - [REGSET_IOPERM32] = { - .core_note_type = NT_386_IOPERM, - .n = IO_BITMAP_BYTES / sizeof(u32), - .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .regset_get = ioperm_get + [REGSET32_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = ioperm_active, + .regset_get = ioperm_get }, }; @@ -1311,10 +1353,10 @@ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 - x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); + x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); + x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9c7265b524c7..1504eb8d25aa 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,180 +37,27 @@ #include <asm/sighandling.h> #include <asm/vm86.h> -#ifdef CONFIG_X86_64 -#include <linux/compat.h> -#include <asm/proto.h> -#include <asm/ia32_unistd.h> -#include <asm/fpu/xstate.h> -#endif /* CONFIG_X86_64 */ - #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> -#ifdef CONFIG_X86_64 -/* - * If regs->ss will cause an IRET fault, change it. Otherwise leave it - * alone. Using this generally makes no sense unless - * user_64bit_mode(regs) would return true. - */ -static void force_valid_ss(struct pt_regs *regs) +static inline int is_ia32_compat_frame(struct ksignal *ksig) { - u32 ar; - asm volatile ("lar %[old_ss], %[ar]\n\t" - "jz 1f\n\t" /* If invalid: */ - "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ - "1:" - : [ar] "=r" (ar) - : [old_ss] "rm" ((u16)regs->ss)); - - /* - * For a valid 64-bit user context, we need DPL 3, type - * read-write data or read-write exp-down data, and S and P - * set. We can't use VERW because VERW doesn't check the - * P bit. - */ - ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; - if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && - ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) - regs->ss = __USER_DS; + return IS_ENABLED(CONFIG_IA32_EMULATION) && + ksig->ka.sa.sa_flags & SA_IA32_ABI; } -# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) -#else -# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) -#endif -static bool restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static inline int is_ia32_frame(struct ksignal *ksig) { - struct sigcontext sc; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return false; - -#ifdef CONFIG_X86_32 - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; -#endif /* CONFIG_X86_32 */ - - regs->bx = sc.bx; - regs->cx = sc.cx; - regs->dx = sc.dx; - regs->si = sc.si; - regs->di = sc.di; - regs->bp = sc.bp; - regs->ax = sc.ax; - regs->sp = sc.sp; - regs->ip = sc.ip; - -#ifdef CONFIG_X86_64 - regs->r8 = sc.r8; - regs->r9 = sc.r9; - regs->r10 = sc.r10; - regs->r11 = sc.r11; - regs->r12 = sc.r12; - regs->r13 = sc.r13; - regs->r14 = sc.r14; - regs->r15 = sc.r15; -#endif /* CONFIG_X86_64 */ - - /* Get CS/SS and force CPL3 */ - regs->cs = sc.cs | 0x03; - regs->ss = sc.ss | 0x03; - - regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); - /* disable syscall checks */ - regs->orig_ax = -1; - -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - - return fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } -static __always_inline int -__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) +static inline int is_x32_frame(struct ksignal *ksig) { -#ifdef CONFIG_X86_32 - unsigned int gs; - savesegment(gs, gs); - - unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); - unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); - unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); - unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); -#endif /* CONFIG_X86_32 */ - - unsafe_put_user(regs->di, &sc->di, Efault); - unsafe_put_user(regs->si, &sc->si, Efault); - unsafe_put_user(regs->bp, &sc->bp, Efault); - unsafe_put_user(regs->sp, &sc->sp, Efault); - unsafe_put_user(regs->bx, &sc->bx, Efault); - unsafe_put_user(regs->dx, &sc->dx, Efault); - unsafe_put_user(regs->cx, &sc->cx, Efault); - unsafe_put_user(regs->ax, &sc->ax, Efault); -#ifdef CONFIG_X86_64 - unsafe_put_user(regs->r8, &sc->r8, Efault); - unsafe_put_user(regs->r9, &sc->r9, Efault); - unsafe_put_user(regs->r10, &sc->r10, Efault); - unsafe_put_user(regs->r11, &sc->r11, Efault); - unsafe_put_user(regs->r12, &sc->r12, Efault); - unsafe_put_user(regs->r13, &sc->r13, Efault); - unsafe_put_user(regs->r14, &sc->r14, Efault); - unsafe_put_user(regs->r15, &sc->r15, Efault); -#endif /* CONFIG_X86_64 */ - - unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); - unsafe_put_user(current->thread.error_code, &sc->err, Efault); - unsafe_put_user(regs->ip, &sc->ip, Efault); -#ifdef CONFIG_X86_32 - unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); - unsafe_put_user(regs->flags, &sc->flags, Efault); - unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); - unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); -#else /* !CONFIG_X86_32 */ - unsafe_put_user(regs->flags, &sc->flags, Efault); - unsafe_put_user(regs->cs, &sc->cs, Efault); - unsafe_put_user(0, &sc->gs, Efault); - unsafe_put_user(0, &sc->fs, Efault); - unsafe_put_user(regs->ss, &sc->ss, Efault); -#endif /* CONFIG_X86_32 */ - - unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - - /* non-iBCS2 extensions.. */ - unsafe_put_user(mask, &sc->oldmask, Efault); - unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); - return 0; -Efault: - return -EFAULT; + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; } -#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ -do { \ - if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ - goto label; \ -} while(0); - -#define unsafe_put_sigmask(set, frame, label) \ - unsafe_put_user(*(__u64 *)(set), \ - (__u64 __user *)&(frame)->uc.uc_sigmask, \ - label) - /* * Set up a signal frame. */ @@ -223,24 +70,12 @@ do { \ /* * Determine which stack to use.. */ -static unsigned long align_sigframe(unsigned long sp) -{ -#ifdef CONFIG_X86_32 - /* - * Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. - */ - sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; -#else /* !CONFIG_X86_32 */ - sp = round_down(sp, FRAME_ALIGNMENT) - 8; -#endif - return sp; -} - -static void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { + struct k_sigaction *ka = &ksig->ka; + int ia32_frame = is_ia32_frame(ksig); /* Default to using normal stack */ bool nested_altstack = on_sig_stack(regs->sp); bool entering_altstack = false; @@ -249,7 +84,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long buf_fx = 0; /* redzone */ - if (IS_ENABLED(CONFIG_X86_64)) + if (!ia32_frame) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ @@ -263,7 +98,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; entering_altstack = true; } - } else if (IS_ENABLED(CONFIG_X86_32) && + } else if (ia32_frame && !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -273,11 +108,19 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, entering_altstack = true; } - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); + sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); *fpstate = (void __user *)sp; - sp = align_sigframe(sp - frame_size); + sp -= frame_size; + + if (ia32_frame) + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; + else + sp = round_down(sp, FRAME_ALIGNMENT) - 8; /* * If we are on the alternate signal stack and would overflow it, don't. @@ -300,391 +143,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)sp; } -#ifdef CONFIG_X86_32 -static const struct { - u16 poplmovl; - u32 val; - u16 int80; -} __attribute__((packed)) retcode = { - 0xb858, /* popl %eax; movl $..., %eax */ - __NR_sigreturn, - 0x80cd, /* int $0x80 */ -}; - -static const struct { - u8 movl; - u32 val; - u16 int80; - u8 pad; -} __attribute__((packed)) rt_retcode = { - 0xb8, /* movl $..., %eax */ - __NR_rt_sigreturn, - 0x80cd, /* int $0x80 */ - 0 -}; - -static int -__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame; - void __user *restorer; - void __user *fp = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - unsafe_put_user(sig, &frame->sig, Efault); - unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); - unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); - if (current->mm->context.vdso) - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_sigreturn; - else - restorer = &frame->retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - - /* Set up to return from userspace. */ - unsafe_put_user(restorer, &frame->pretcode, Efault); - - /* - * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); - user_access_end(); - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ksig->ka.sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = 0; - regs->cx = 0; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; - -Efault: - user_access_end(); - return -EFAULT; -} - -static int __setup_rt_frame(int sig, struct ksignal *ksig, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *restorer; - void __user *fp = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - unsafe_put_user(sig, &frame->sig, Efault); - unsafe_put_user(&frame->info, &frame->pinfo, Efault); - unsafe_put_user(&frame->uc, &frame->puc, Efault); - - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); - else - unsafe_put_user(0, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - unsafe_put_user(restorer, &frame->pretcode, Efault); - - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ksig->ka.sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = (unsigned long)&frame->info; - regs->cx = (unsigned long)&frame->uc; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -Efault: - user_access_end(); - return -EFAULT; -} -#else /* !CONFIG_X86_32 */ -static unsigned long frame_uc_flags(struct pt_regs *regs) -{ - unsigned long flags; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) - flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; - else - flags = UC_SIGCONTEXT_SS; - - if (likely(user_64bit_mode(regs))) - flags |= UC_STRICT_RESTORE_SS; - - return flags; -} - -static int __setup_rt_frame(int sig, struct ksignal *ksig, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *fp = NULL; - unsigned long uc_flags; - - /* x86-64 should always use SA_RESTORER. */ - if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) - return -EFAULT; - - frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); - uc_flags = frame_uc_flags(regs); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - /* Create the ucontext. */ - unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - - /* Set up registers for signal handler */ - regs->di = sig; - /* In case the signal handler was declared without prototypes */ - regs->ax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->si = (unsigned long)&frame->info; - regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ksig->ka.sa.sa_handler; - - regs->sp = (unsigned long)frame; - - /* - * Set up the CS and SS registers to run signal handlers in - * 64-bit mode, even if the handler happens to be interrupting - * 32-bit or 16-bit code. - * - * SS is subtle. In 64-bit mode, we don't need any particular - * SS descriptor, but we do need SS to be valid. It's possible - * that the old SS is entirely bogus -- this can happen if the - * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatibility issue here: DOSEMU - * relies on the contents of the SS register indicating the - * SS value at the time of the signal, even though that code in - * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU - * avoids relying on sigreturn to restore SS; instead it uses - * a trampoline.) So we do our best: if the old SS was valid, - * we keep it. Otherwise we replace it. - */ - regs->cs = __USER_CS; - - if (unlikely(regs->ss != __USER_DS)) - force_valid_ss(regs); - - return 0; - -Efault: - user_access_end(); - return -EFAULT; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_X32_ABI -static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -{ - struct compat_siginfo new; - - copy_siginfo_to_external32(&new, from); - if (from->si_signo == SIGCHLD) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } - if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) - return -EFAULT; - return 0; -} - -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -{ - if (in_x32_syscall()) - return x32_copy_siginfo_to_user(to, from); - return __copy_siginfo_to_user32(to, from); -} -#endif /* CONFIG_X86_X32_ABI */ - -static int x32_setup_rt_frame(struct ksignal *ksig, - compat_sigset_t *set, - struct pt_regs *regs) -{ -#ifdef CONFIG_X86_X32_ABI - struct rt_sigframe_x32 __user *frame; - unsigned long uc_flags; - void __user *restorer; - void __user *fp = NULL; - - if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) - return -EFAULT; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - - uc_flags = frame_uc_flags(regs); - - if (!user_access_begin(frame, sizeof(*frame))) - return -EFAULT; - - /* Create the ucontext. */ - unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); - unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - unsafe_put_user(0, &frame->uc.uc__pad0, Efault); - restorer = ksig->ka.sa.sa_restorer; - unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); - unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); - unsafe_put_sigmask(set, frame, Efault); - user_access_end(); - - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - } - - /* Set up registers for signal handler */ - regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ksig->ka.sa.sa_handler; - - /* We use the x32 calling convention here... */ - regs->di = ksig->sig; - regs->si = (unsigned long) &frame->info; - regs->dx = (unsigned long) &frame->uc; - - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); - - regs->cs = __USER_CS; - regs->ss = __USER_DS; -#endif /* CONFIG_X86_X32_ABI */ - - return 0; -#ifdef CONFIG_X86_X32_ABI -Efault: - user_access_end(); - return -EFAULT; -#endif -} - -/* - * Do a signal return; undo the signal stack. - */ -#ifdef CONFIG_X86_32 -SYSCALL_DEFINE0(sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct sigframe __user *frame; - sigset_t set; - - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || - __get_user(set.sig[1], &frame->extramask[0])) - goto badframe; - - set_current_blocked(&set); - - /* - * x86_32 has no uc_flags bits relevant to restore_sigcontext. - * Save a few cycles by skipping the __get_user. - */ - if (!restore_sigcontext(regs, &frame->sc, 0)) - goto badframe; - return regs->ax; - -badframe: - signal_fault(regs, frame, "sigreturn"); - - return 0; -} -#endif /* CONFIG_X86_32 */ - -SYSCALL_DEFINE0(rt_sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe __user *frame; - sigset_t set; - unsigned long uc_flags; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) - goto badframe; - if (__get_user(uc_flags, &frame->uc.uc_flags)) - goto badframe; - - set_current_blocked(&set); - - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) - goto badframe; - - if (restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return regs->ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - /* * There are four different struct types for signal frame: sigframe_ia32, * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case @@ -743,43 +201,22 @@ unsigned long get_sigframe_size(void) return max_frame_size; } -static inline int is_ia32_compat_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_IA32_EMULATION) && - ksig->ka.sa.sa_flags & SA_IA32_ABI; -} - -static inline int is_ia32_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); -} - -static inline int is_x32_frame(struct ksignal *ksig) -{ - return IS_ENABLED(CONFIG_X86_X32_ABI) && - ksig->ka.sa.sa_flags & SA_X32_ABI; -} - static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { - int usig = ksig->sig; - sigset_t *set = sigmask_to_save(); - compat_sigset_t *cset = (compat_sigset_t *) set; - /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ksig, cset, regs); + return ia32_setup_rt_frame(ksig, regs); else - return ia32_setup_frame(usig, ksig, cset, regs); + return ia32_setup_frame(ksig, regs); } else if (is_x32_frame(ksig)) { - return x32_setup_rt_frame(ksig, cset, regs); + return x32_setup_rt_frame(ksig, regs); } else { - return __setup_rt_frame(ksig->sig, ksig, set, regs); + return x64_setup_rt_frame(ksig, regs); } } @@ -969,36 +406,3 @@ bool sigaltstack_size_valid(size_t ss_size) return true; } #endif /* CONFIG_DYNAMIC_SIGFRAME */ - -#ifdef CONFIG_X86_X32_ABI -COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) -{ - struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe_x32 __user *frame; - sigset_t set; - unsigned long uc_flags; - - frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) - goto badframe; - if (__get_user(uc_flags, &frame->uc.uc_flags)) - goto badframe; - - set_current_blocked(&set); - - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) - goto badframe; - - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; - - return regs->ax; - -badframe: - signal_fault(regs, frame, "x32 rt_sigreturn"); - return 0; -} -#endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/kernel/signal_32.c index c9c3859322fa..2553136cf39b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/kernel/signal_32.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/arch/x86_64/ia32/ia32_signal.c - * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson @@ -26,7 +24,6 @@ #include <linux/uaccess.h> #include <asm/fpu/signal.h> #include <asm/ptrace.h> -#include <asm/ia32_unistd.h> #include <asm/user32.h> #include <uapi/asm/sigcontext.h> #include <asm/proto.h> @@ -35,6 +32,9 @@ #include <asm/sighandling.h> #include <asm/smap.h> +#ifdef CONFIG_IA32_EMULATION +#include <asm/ia32_unistd.h> + static inline void reload_segments(struct sigcontext_32 *sc) { unsigned int cur; @@ -53,6 +53,21 @@ static inline void reload_segments(struct sigcontext_32 *sc) loadsegment(es, sc->es | 0x03); } +#define sigset32_t compat_sigset_t +#define restore_altstack32 compat_restore_altstack +#define unsafe_save_altstack32 unsafe_compat_save_altstack + +#else + +#define sigset32_t sigset_t +#define __NR_ia32_sigreturn __NR_sigreturn +#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn +#define restore_altstack32 restore_altstack +#define unsafe_save_altstack32 unsafe_save_altstack +#define __copy_siginfo_to_user32 copy_siginfo_to_user + +#endif + /* * Do a signal return; undo the signal stack. */ @@ -86,6 +101,7 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, /* disable syscall checks */ regs->orig_ax = -1; +#ifdef CONFIG_IA32_EMULATION /* * Reload fs and gs if they have changed in the signal * handler. This does not handle long fs/gs base changes in @@ -93,10 +109,17 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, * normal case. */ reload_segments(&sc); +#else + loadsegment(gs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; +#endif + return fpu__restore_sig(compat_ptr(sc.fpstate), 1); } -COMPAT_SYSCALL_DEFINE0(sigreturn) +SYSCALL32_DEFINE0(sigreturn) { struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); @@ -119,7 +142,7 @@ badframe: return 0; } -COMPAT_SYSCALL_DEFINE0(rt_sigreturn) +SYSCALL32_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; @@ -129,7 +152,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; set_current_blocked(&set); @@ -137,7 +160,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; - if (compat_restore_altstack(&frame->uc.uc_stack)) + if (restore_altstack32(&frame->uc.uc_stack)) goto badframe; return regs->ax; @@ -159,9 +182,15 @@ __unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, struct pt_regs *regs, unsigned int mask) { unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); +#ifdef CONFIG_IA32_EMULATION unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); +#else + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); +#endif unsafe_put_user(regs->di, &sc->di, Efault); unsafe_put_user(regs->si, &sc->si, Efault); @@ -196,43 +225,9 @@ do { \ goto label; \ } while(0) -/* - * Determine which stack to use.. - */ -static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, - size_t frame_size, - void __user **fpstate) -{ - unsigned long sp, fx_aligned, math_size; - - /* Default to using normal stack */ - sp = regs->sp; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ksig->ka.sa.sa_flags & SA_ONSTACK) - sp = sigsp(sp, ksig); - /* This is the legacy signal stack switching. */ - else if (regs->ss != __USER32_DS && - !(ksig->ka.sa.sa_flags & SA_RESTORER) && - ksig->ka.sa.sa_restorer) - sp = (unsigned long) ksig->ka.sa.sa_restorer; - - sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_32 __user *) sp; - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size)) - return (void __user *) -1L; - - sp -= frame_size; - /* Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - sp = ((sp + 4) & -16ul) - 4; - return (void __user *) sp; -} - -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs) +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs) { + sigset32_t *set = (sigset32_t *) sigmask_to_save(); struct sigframe_ia32 __user *frame; void __user *restorer; void __user *fp = NULL; @@ -264,7 +259,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ksig->sig, &frame->sig, Efault); unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); @@ -280,15 +275,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* Make -mregparm=3 work */ - regs->ax = sig; + regs->ax = ksig->sig; regs->dx = 0; regs->cx = 0; - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->ss = __USER_DS; return 0; Efault: @@ -296,9 +296,9 @@ Efault: return -EFAULT; } -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs) +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { + sigset32_t *set = (sigset32_t *) sigmask_to_save(); struct rt_sigframe_ia32 __user *frame; void __user *restorer; void __user *fp = NULL; @@ -321,7 +321,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(ksig->sig, &frame->sig, Efault); unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); @@ -331,7 +331,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, else unsafe_put_user(0, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); - unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; @@ -357,15 +357,20 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* Make -mregparm=3 work */ - regs->ax = sig; + regs->ax = ksig->sig; regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->ss = __USER_DS; return 0; Efault: diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c new file mode 100644 index 000000000000..ff9c55064223 --- /dev/null +++ b/arch/x86/kernel/signal_64.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> + +#include <asm/ucontext.h> +#include <asm/fpu/signal.h> +#include <asm/sighandling.h> + +#include <asm/syscall.h> +#include <asm/sigframe.h> +#include <asm/signal.h> + +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} + +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) +{ + struct sigcontext sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) + return false; + + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); + + return fpu__restore_sig((void __user *)sc.fpstate, 0); +} + +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); + + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); + + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_XSAVE)) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset_t *set = sigmask_to_save(); + struct rt_sigframe __user *frame; + void __user *fp = NULL; + unsigned long uc_flags; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->di = ksig->sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatibility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ + regs->cs = __USER_CS; + + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +/* + * Do a signal return; undo the signal stack. + */ +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} + +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); + struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; + void __user *restorer; + void __user *fp = NULL; + + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = ksig->sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif /* CONFIG_X86_X32_ABI */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d3fdec706f1d..d1e1679f32cf 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -68,13 +68,13 @@ #ifdef CONFIG_X86_64 #include <asm/x86_init.h> -#include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/proto.h> #endif +#include <asm/proto.h> + DECLARE_BITMAP(system_vectors, NR_VECTORS); static inline void cond_local_irq_enable(struct pt_regs *regs) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cafacb2e58cc..a78e73da4a74 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -51,7 +51,7 @@ int tsc_clocksource_reliable; static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; -struct clocksource *art_related_clocksource; +static struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index b63cf8f7745e..6c07f6daaa22 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -722,8 +722,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -753,6 +754,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } +setup: auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 62bc7a01cecc..c92c49a0b35b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_scattered(CPUID_12_EAX, - SF(SGX1) | SF(SGX2) + SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); kvm_cpu_cap_mask(CPUID_8000_0001_ECX, @@ -1047,9 +1047,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is * expected to derive it from supported XCR0. */ - entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | - SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | - SGX_ATTR_KSS; + entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK; entry->ebx &= 0; break; /* Intel PT */ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1ccb769f62af..b6f96d47e596 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2443,6 +2443,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, { bool list_unstable, zapped_root = false; + lockdep_assert_held_write(&kvm->mmu_lock); trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); @@ -4262,14 +4263,14 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; - r = make_mmu_pages_available(vcpu); - if (r) - goto out_unlock; - - if (is_tdp_mmu_fault) + if (is_tdp_mmu_fault) { r = kvm_tdp_mmu_map(vcpu, fault); - else + } else { + r = make_mmu_pages_available(vcpu); + if (r) + goto out_unlock; r = __direct_map(vcpu, fault); + } out_unlock: if (is_tdp_mmu_fault) diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index a19d473d0184..4e5b8444f161 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -23,6 +23,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ #define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) +#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) struct cpuid_reg { u32 function; @@ -78,6 +79,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX1; else if (x86_feature == X86_FEATURE_SGX2) return KVM_X86_FEATURE_SGX2; + else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) + return KVM_X86_FEATURE_SGX_EDECCSSA; return x86_feature; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 4c620999d230..995bc0f90759 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1091,6 +1091,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) static void nested_svm_triple_fault(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN)) + return; + + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN); } @@ -1125,6 +1131,9 @@ void svm_free_nested(struct vcpu_svm *svm) if (!svm->nested.initialized) return; + if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr)) + svm_switch_vmcb(svm, &svm->vmcb01); + svm_vcpu_free_msrpm(svm->nested.msrpm); svm->nested.msrpm = NULL; @@ -1143,9 +1152,6 @@ void svm_free_nested(struct vcpu_svm *svm) svm->nested.initialized = false; } -/* - * Forcibly leave nested mode in order to be able to reset the VCPU later on. - */ void svm_leave_nested(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9f88c8e6766e..ce362e88a567 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -346,12 +346,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) return 0; } -static int is_external_interrupt(u32 info) -{ - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); -} - static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1438,6 +1432,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) */ svm_clear_current_vmcb(svm->vmcb); + svm_leave_nested(vcpu); svm_free_nested(svm); sev_free_vcpu(vcpu); @@ -2709,9 +2704,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data = 0; switch (msr->index) { - case MSR_F10H_DECFG: - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + case MSR_AMD64_DE_CFG: + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; case MSR_IA32_PERF_CAPABILITIES: return 0; @@ -2812,7 +2807,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; - case MSR_F10H_DECFG: + case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; default: @@ -3041,7 +3036,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_F10H_DECFG: { + case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; msr_entry.index = msr->index; @@ -3425,15 +3420,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 0; } - if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && - exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) - printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " - "exit_code 0x%x\n", - __func__, svm->vmcb->control.exit_int_info, - exit_code); - if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0c62352dda6a..5b0d4859e4b7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4854,6 +4854,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) { + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); } @@ -6440,9 +6441,6 @@ out: return kvm_state.size; } -/* - * Forcibly leave nested mode in order to be able to reset the VCPU later on. - */ void vmx_leave_nested(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecea83f0da49..69227f77b201 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -628,6 +628,12 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } +/* Forcibly leave the nested mode in cases like a vCPU reset */ +static void kvm_leave_nested(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops.nested_ops->leave_nested(vcpu); +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) @@ -1557,7 +1563,7 @@ static const u32 msr_based_features_all[] = { MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, - MSR_F10H_DECFG, + MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, @@ -5195,7 +5201,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SMM) { if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { - kvm_x86_ops.nested_ops->leave_nested(vcpu); + kvm_leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); } @@ -9805,7 +9811,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) int kvm_check_nested_events(struct kvm_vcpu *vcpu) { - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { kvm_x86_ops.nested_ops->triple_fault(vcpu); return 1; } @@ -10560,10 +10566,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - if (is_guest_mode(vcpu)) { + if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + if (is_guest_mode(vcpu)) kvm_x86_ops.nested_ops->triple_fault(vcpu); - } else { + + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; vcpu->mmio_needed = 0; r = 0; @@ -11997,8 +12004,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) WARN_ON_ONCE(!init_event && (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); + /* + * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's + * possible to INIT the vCPU while L2 is active. Force the vCPU back + * into L1 as EFER.SVME is cleared on INIT (along with all other EFER + * bits), i.e. virtualization is disabled. + */ + if (is_guest_mode(vcpu)) + kvm_leave_nested(vcpu); + kvm_lapic_reset(vcpu, init_event); + WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); vcpu->arch.hflags = 0; vcpu->arch.smi_pending = 0; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2dae413bd62a..f3098c0e386a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -954,6 +954,14 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); } +static inline int max_evtchn_port(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) + return EVTCHN_2L_NR_CHANNELS; + else + return COMPAT_EVTCHN_2L_NR_CHANNELS; +} + static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, evtchn_port_t *ports) { @@ -1042,6 +1050,10 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, *r = -EFAULT; goto out; } + if (ports[i] >= max_evtchn_port(vcpu->kvm)) { + *r = -EINVAL; + goto out; + } } if (sched_poll.nr_ports == 1) @@ -1215,6 +1227,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) bool longmode; u64 input, params[6], r = -ENOSYS; bool handled = false; + u8 cpl; input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -1242,9 +1255,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) params[5] = (u64)kvm_r9_read(vcpu); } #endif + cpl = static_call(kvm_x86_get_cpl)(vcpu); trace_kvm_xen_hypercall(input, params[0], params[1], params[2], params[3], params[4], params[5]); + /* + * Only allow hypercall acceleration for CPL0. The rare hypercalls that + * are permitted in guest userspace can be handled by the VMM. + */ + if (unlikely(cpl > 0)) + goto handle_in_userspace; + switch (input) { case __HYPERVISOR_xen_version: if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { @@ -1279,10 +1300,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) if (handled) return kvm_xen_hypercall_set_result(vcpu, r); +handle_in_userspace: vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; - vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu); + vcpu->run->xen.u.hcall.cpl = cpl; vcpu->run->xen.u.hcall.input = input; vcpu->run->xen.u.hcall.params[0] = params[0]; vcpu->run->xen.u.hcall.params[1] = params[1]; @@ -1297,14 +1319,6 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) return 0; } -static inline int max_evtchn_port(struct kvm *kvm) -{ - if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) - return EVTCHN_2L_NR_CHANNELS; - else - return COMPAT_EVTCHN_2L_NR_CHANNELS; -} - static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) { int poll_evtchn = vcpu->arch.xen.poll_evtchn; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 78c5bc654cff..6453fbaedb08 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -217,9 +217,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PHYSICAL_PAGE_MASK; + phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; + /* + * Mask out any bits not part of the actual physical + * address, like memory encryption bits. + */ + phys_addr &= PHYSICAL_PAGE_MASK; + retval = memtype_reserve(phys_addr, (u64)phys_addr + size, pcm, &new_pcm); if (retval) { diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 2e5a045731de..ef34ba21aa92 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> +#include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> @@ -330,6 +331,23 @@ void arch_invalidate_pmem(void *addr, size_t size) EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif +#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION +bool cpu_cache_has_invalidate_memregion(void) +{ + return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM); + +int cpu_cache_invalidate_memregion(int res_desc) +{ + if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) + return -ENXIO; + wbinvd_on_all_cpus(); + return 0; +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM); +#endif + static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 00127abd89ee..99620428ad78 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,7 +11,6 @@ #include <linux/bpf.h> #include <linux/memory.h> #include <linux/sort.h> -#include <linux/init.h> #include <asm/extable.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> @@ -389,18 +388,6 @@ out: return ret; } -int __init bpf_arch_init_dispatcher_early(void *ip) -{ - const u8 *nop_insn = x86_nops[5]; - - if (is_endbr(*(u32 *)ip)) - ip += ENDBR_INSN_SIZE; - - if (memcmp(ip, nop_insn, X86_PATCH_SIZE)) - text_poke_early(ip, nop_insn, X86_PATCH_SIZE); - return 0; -} - int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 994a229cb79f..68244a3422d1 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -183,13 +183,12 @@ err_sysfs: return r; } -static int xo15_sci_remove(struct acpi_device *device) +static void xo15_sci_remove(struct acpi_device *device) { acpi_disable_gpe(NULL, xo15_sci_gpe); acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler); cancel_work_sync(&sci_work); sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr); - return 0; } #ifdef CONFIG_PM_SLEEP diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index bb176c72891c..93ae33248f42 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -513,15 +513,23 @@ static int pm_cpu_check(const struct x86_cpu_id *c) static void pm_save_spec_msr(void) { - u32 spec_msr_id[] = { - MSR_IA32_SPEC_CTRL, - MSR_IA32_TSX_CTRL, - MSR_TSX_FORCE_ABORT, - MSR_IA32_MCU_OPT_CTRL, - MSR_AMD64_LS_CFG, + struct msr_enumeration { + u32 msr_no; + u32 feature; + } msr_enum[] = { + { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL }, + { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL }, + { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT }, + { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL }, + { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD }, + { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC }, }; + int i; - msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id)); + for (i = 0; i < ARRAY_SIZE(msr_enum); i++) { + if (boot_cpu_has(msr_enum[i].feature)) + msr_build_context(&msr_enum[i].msr_no, 1); + } } static int pm_check_save_msr(void) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 745420853a7c..1a2ba31635a5 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -23,6 +23,7 @@ #include <linux/start_kernel.h> #include <linux/sched.h> #include <linux/kprobes.h> +#include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/export.h> #include <linux/mm.h> @@ -113,7 +114,7 @@ static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); static int __init parse_xen_msr_safe(char *str) { if (str) - return strtobool(str, &xen_msr_safe); + return kstrtobool(str, &xen_msr_safe); return -EINVAL; } early_param("xen_msr_safe", parse_xen_msr_safe); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4f4309500559..8db26f10fb1d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/sched.h> +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/pm.h> #include <linux/memblock.h> @@ -85,7 +86,7 @@ static void __init xen_parse_512gb(void) arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); if (!arg) val = true; - else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val)) return; xen_512gb_limit = val; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c3e1f9a7d43a..4b0d6fff88de 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) void xen_smp_intr_free(unsigned int cpu) { + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; if (per_cpu(xen_resched_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); per_cpu(xen_resched_irq, cpu).irq = -1; - kfree(per_cpu(xen_resched_irq, cpu).name); - per_cpu(xen_resched_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); per_cpu(xen_callfunc_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfunc_irq, cpu).name); - per_cpu(xen_callfunc_irq, cpu).name = NULL; } + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; if (per_cpu(xen_debug_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); per_cpu(xen_debug_irq, cpu).irq = -1; - kfree(per_cpu(xen_debug_irq, cpu).name); - per_cpu(xen_debug_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, NULL); per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); - per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } } @@ -65,6 +65,7 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, @@ -74,9 +75,9 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_resched_irq, cpu).irq = rc; - per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, @@ -86,10 +87,10 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfunc_irq, cpu).irq = rc; - per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; if (!xen_fifo_events) { debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + per_cpu(xen_debug_irq, cpu).name = debug_name; rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, IRQF_PERCPU | IRQF_NOBALANCING, @@ -97,10 +98,10 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_debug_irq, cpu).irq = rc; - per_cpu(xen_debug_irq, cpu).name = debug_name; } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, xen_call_function_single_interrupt, @@ -110,7 +111,6 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; - per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; return 0; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 480be82e9b7b..6175f2c5c822 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -97,18 +97,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void) void xen_smp_intr_free_pv(unsigned int cpu) { + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; if (per_cpu(xen_irq_work, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; } + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; } } @@ -118,6 +118,7 @@ int xen_smp_intr_init_pv(unsigned int cpu) char *callfunc_name, *pmu_name; callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + per_cpu(xen_irq_work, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, xen_irq_work_interrupt, @@ -127,10 +128,10 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; if (is_xen_pmu) { pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + per_cpu(xen_pmu_irq, cpu).name = pmu_name; rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, xen_pmu_irq_handler, IRQF_PERCPU|IRQF_NOBALANCING, @@ -138,7 +139,6 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; } return 0; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 043c73dfd2c9..5c6fc16e4b92 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -75,6 +75,7 @@ void xen_init_lock_cpu(int cpu) cpu, per_cpu(lock_kicker_irq, cpu)); name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + per_cpu(irq_name, cpu) = name; irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, dummy_handler, @@ -85,7 +86,6 @@ void xen_init_lock_cpu(int cpu) if (irq >= 0) { disable_irq(irq); /* make sure it's never delivered */ per_cpu(lock_kicker_irq, cpu) = irq; - per_cpu(irq_name, cpu) = name; } printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -98,6 +98,8 @@ void xen_uninit_lock_cpu(int cpu) if (!xen_pvspin) return; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; /* * When booting the kernel with 'mitigations=auto,nosmt', the secondary * CPUs are not activated, and lock_kicker_irq is not initialized. @@ -108,8 +110,6 @@ void xen_uninit_lock_cpu(int cpu) unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; - kfree(per_cpu(irq_name, cpu)); - per_cpu(irq_name, cpu) = NULL; } PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6b4fdf6b9542..4a184f6e4e4d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -262,10 +262,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe @@ -284,10 +284,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSENTER_compat_after_hwframe |