diff options
Diffstat (limited to 'arch/x86/xen/enlighten_pv.c')
| -rw-r--r-- | arch/x86/xen/enlighten_pv.c | 226 |
1 files changed, 163 insertions, 63 deletions
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 93b658248d01..4806cc28d7ca 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -49,6 +49,7 @@ #include <xen/hvc-console.h> #include <xen/acpi.h> +#include <asm/cpuid/api.h> #include <asm/paravirt.h> #include <asm/apic.h> #include <asm/page.h> @@ -60,6 +61,7 @@ #include <asm/processor.h> #include <asm/proto.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> @@ -72,6 +74,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> +#include <asm/irq_stack.h> #ifdef CONFIG_X86_IOPL_IOPERM #include <asm/io_bitmap.h> #endif @@ -79,16 +82,12 @@ #ifdef CONFIG_ACPI #include <linux/acpi.h> #include <asm/acpi.h> -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <acpi/processor.h> #include <xen/interface/platform.h> #endif #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" -#include "multicalls.h" -#include "pmu.h" #include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ @@ -97,10 +96,58 @@ void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); +#ifndef CONFIG_PREEMPTION +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +/* + * In case of scheduling the flag must be cleared and restored after + * returning from schedule as the task might move to a different CPU. + */ +static __always_inline bool get_and_clear_inhcall(void) +{ + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); + + __this_cpu_write(xen_in_preemptible_hcall, false); + return inhcall; +} + +static __always_inline void restore_inhcall(bool inhcall) +{ + __this_cpu_write(xen_in_preemptible_hcall, inhcall); +} + +#else + +static __always_inline bool get_and_clear_inhcall(void) { return false; } +static __always_inline void restore_inhcall(bool inhcall) { } + +#endif + struct tls_descs { struct desc_struct desc[3]; }; +DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE; + +enum xen_lazy_mode xen_get_lazy_mode(void) +{ + if (in_interrupt()) + return XEN_LAZY_NONE; + + return this_cpu_read(xen_lazy_mode); +} + /* * Updating the 3 TLS descriptors in the GDT on every task switch is * surprisingly expensive so we avoid updating them if they haven't @@ -164,7 +211,7 @@ static void __init xen_set_mtrr_data(void) /* Only overwrite MTRR state if any MTRR could be got from Xen. */ if (reg) - mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE); + guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE); #endif } @@ -188,7 +235,10 @@ static void __init xen_pv_init_platform(void) if (xen_initial_domain()) xen_set_mtrr_data(); else - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); + + /* Adjust nr_cpu_ids before "enumeration" happens */ + xen_smp_count_cpus(); } static void __init xen_pv_guest_late_init(void) @@ -205,14 +255,22 @@ static __read_mostly unsigned int cpuid_leaf5_edx_val; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { - unsigned maskebx = ~0; + unsigned int maskebx = ~0; + unsigned int or_ebx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ switch (*ax) { - case CPUID_MWAIT_LEAF: + case 0x1: + /* Replace initial APIC ID in bits 24-31 of EBX. */ + /* See xen_pv_smp_config() for related topology preparations. */ + maskebx = 0x00ffffff; + or_ebx = smp_processor_id() << 24; + break; + + case CPUID_LEAF_MWAIT: /* Synthesize the values.. */ *ax = 0; *bx = 0; @@ -234,6 +292,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, : "0" (*ax), "2" (*cx)); *bx &= maskebx; + *bx |= or_ebx; } static bool __init xen_check_mwait(void) @@ -281,24 +340,24 @@ static bool __init xen_check_mwait(void) * ecx and edx. The hypercall provides only partial information. */ - ax = CPUID_MWAIT_LEAF; + ax = CPUID_LEAF_MWAIT; bx = 0; cx = 0; dx = 0; native_cpuid(&ax, &bx, &cx, &dx); - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so, * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. */ buf[0] = ACPI_PDC_REVISION_ID; buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP); set_xen_guest_handle(op.u.set_pminfo.pdc, buf); if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; } @@ -323,7 +382,6 @@ static bool __init xen_check_xsave(void) static void __init xen_init_capabilities(void) { - setup_force_cpu_cap(X86_FEATURE_XENPV); setup_clear_cpu_cap(X86_FEATURE_DCA); setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); setup_clear_cpu_cap(X86_FEATURE_MTRR); @@ -362,10 +420,25 @@ static noinstr unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } +static void xen_start_context_switch(struct task_struct *prev) +{ + BUG_ON(preemptible()); + + if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } + enter_lazy(XEN_LAZY_CPU); +} + static void xen_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + xen_mc_flush(); - paravirt_end_context_switch(next); + leave_lazy(XEN_LAZY_CPU); + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } static unsigned long xen_store_tr(void) @@ -472,7 +545,7 @@ static void xen_set_ldt(const void *addr, unsigned entries) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gdt(const struct desc_ptr *dtr) @@ -523,7 +596,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - pfn = virt_to_pfn(va); + pfn = virt_to_pfn((void *)va); mfn = pfn_to_mfn(pfn); pte = pfn_pte(pfn, PAGE_KERNEL_RO); @@ -568,7 +641,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * exception between the new %fs descriptor being loaded and * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (xen_get_lazy_mode() == XEN_LAZY_CPU) loadsegment(fs, 0); xen_mc_batch(); @@ -577,7 +650,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) load_TLS_descriptor(t, cpu, 1); load_TLS_descriptor(t, cpu, 2); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gs_index(unsigned int idx) @@ -652,6 +725,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) } #endif +static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + + xen_evtchn_do_upcall(); + + set_irq_regs(old_regs); +} + +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + bool inhcall; + + instrumentation_begin(); + run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); + + inhcall = get_and_clear_inhcall(); + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { + irqentry_exit_cond_resched(); + instrumentation_end(); + restore_inhcall(inhcall); + } else { + instrumentation_end(); + irqentry_exit(regs, state); + } +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -678,7 +781,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, + TRAP_ENTRY(int80_emulation, false ), #endif TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), @@ -694,7 +797,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_coprocessor_error, false ), TRAP_ENTRY(exc_alignment_check, false ), TRAP_ENTRY(exc_simd_coprocessor_error, false ), -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET TRAP_ENTRY(exc_control_protection, false ), #endif }; @@ -909,7 +1012,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } @@ -973,7 +1076,7 @@ static void xen_write_cr0(unsigned long cr0) MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_write_cr4(unsigned long cr4) @@ -983,37 +1086,33 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static u64 xen_do_read_msr(unsigned int msr, int *err) +static u64 xen_do_read_msr(u32 msr, int *err) { u64 val = 0; /* Avoid uninitialized value for safe variant. */ - if (pmu_msr_read(msr, &val, err)) + if (pmu_msr_chk_emulated(msr, &val, true)) return val; if (err) - val = native_read_msr_safe(msr, err); + *err = native_read_msr_safe(msr, &val); else val = native_read_msr(msr); switch (msr) { case MSR_IA32_APICBASE: val &= ~X2APIC_ENABLE; + if (smp_processor_id() == 0) + val |= MSR_IA32_APICBASE_BSP; + else + val &= ~MSR_IA32_APICBASE_BSP; break; } return val; } -static void set_seg(unsigned int which, unsigned int low, unsigned int high, - int *err) +static void set_seg(u32 which, u64 base) { - u64 base = ((u64)high << 32) | low; - - if (HYPERVISOR_set_segment_base(which, base) == 0) - return; - - if (err) - *err = -EIO; - else + if (HYPERVISOR_set_segment_base(which, base)) WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base); } @@ -1022,20 +1121,19 @@ static void set_seg(unsigned int which, unsigned int low, unsigned int high, * With err == NULL write_msr() semantics are selected. * Supplying an err pointer requires err to be pre-initialized with 0. */ -static void xen_do_write_msr(unsigned int msr, unsigned int low, - unsigned int high, int *err) +static void xen_do_write_msr(u32 msr, u64 val, int *err) { switch (msr) { case MSR_FS_BASE: - set_seg(SEGBASE_FS, low, high, err); + set_seg(SEGBASE_FS, val); break; case MSR_KERNEL_GS_BASE: - set_seg(SEGBASE_GS_USER, low, high, err); + set_seg(SEGBASE_GS_USER, val); break; case MSR_GS_BASE: - set_seg(SEGBASE_GS_KERNEL, low, high, err); + set_seg(SEGBASE_GS_KERNEL, val); break; case MSR_STAR: @@ -1051,42 +1149,45 @@ static void xen_do_write_msr(unsigned int msr, unsigned int low, break; default: - if (!pmu_msr_write(msr, low, high, err)) { - if (err) - *err = native_write_msr_safe(msr, low, high); - else - native_write_msr(msr, low, high); - } + if (pmu_msr_chk_emulated(msr, &val, false)) + return; + + if (err) + *err = native_write_msr_safe(msr, val); + else + native_write_msr(msr, val); } } -static u64 xen_read_msr_safe(unsigned int msr, int *err) +static int xen_read_msr_safe(u32 msr, u64 *val) { - return xen_do_read_msr(msr, err); + int err = 0; + + *val = xen_do_read_msr(msr, &err); + return err; } -static int xen_write_msr_safe(unsigned int msr, unsigned int low, - unsigned int high) +static int xen_write_msr_safe(u32 msr, u64 val) { int err = 0; - xen_do_write_msr(msr, low, high, &err); + xen_do_write_msr(msr, val, &err); return err; } -static u64 xen_read_msr(unsigned int msr) +static u64 xen_read_msr(u32 msr) { - int err; + int err = 0; return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL); } -static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +static void xen_write_msr(u32 msr, u64 val) { int err; - xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL); + xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL); } /* This is called once we have the cpu_possible_mask */ @@ -1123,8 +1224,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { .write_cr4 = xen_write_cr4, - .wbinvd = pv_native_wbinvd, - .read_msr = xen_read_msr, .write_msr = xen_write_msr, @@ -1156,7 +1255,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, + .start_context_switch = xen_start_context_switch, .end_context_switch = xen_end_context_switch, }, }; @@ -1302,7 +1401,11 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) JMP32_INSN_SIZE); xen_domain_type = XEN_PV_DOMAIN; + setup_force_cpu_cap(X86_FEATURE_XENPV); xen_start_flags = xen_start_info->flags; + /* Interrupts are guaranteed to be off initially. */ + early_boot_irqs_disabled = true; + static_call_update_early(xen_hypercall, xen_hypercall_pv); xen_setup_features(); @@ -1326,7 +1429,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) x86_init.resources.memory_setup = xen_memory_setup; x86_init.irqs.intr_mode_select = x86_init_noop; - x86_init.irqs.intr_mode_init = x86_init_noop; + x86_init.irqs.intr_mode_init = x86_64_probe_apic; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; x86_init.hyper.init_platform = xen_pv_init_platform; @@ -1366,12 +1469,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_init_capabilities(); -#ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. */ xen_init_apic(); -#endif machine_ops = xen_machine_ops; @@ -1395,7 +1496,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); - early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, |
