diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/apic.c | 3 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 119 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 239 | ||||
-rw-r--r-- | arch/x86/xen/p2m.c | 186 | ||||
-rw-r--r-- | arch/x86/xen/pci-swiotlb-xen.c | 3 | ||||
-rw-r--r-- | arch/x86/xen/platform-pci-unplug.c | 1 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 50 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/xen-head.S | 56 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 3 |
10 files changed, 536 insertions, 126 deletions
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index ec57bd3818a4..7005ced5d1ad 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -6,8 +6,9 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> +#include "xen-ops.h" -unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { struct physdev_apic apic_op; int ret; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4b821e..47b3acdc2ac5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -38,6 +38,7 @@ #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> #include <xen/interface/memory.h> +#include <xen/interface/xen-mca.h> #include <xen/features.h> #include <xen/page.h> #include <xen/hvm.h> @@ -79,6 +80,8 @@ #include "smp.h" #include "multicalls.h" +#include <xen/events.h> + EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); @@ -107,7 +110,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all @@ -124,6 +127,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -341,9 +357,7 @@ static void __init xen_init_cpuid_mask(void) unsigned int xsave_mask; cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_MTRR) | /* disable MTRR */ + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) @@ -540,12 +554,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG(); } +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_table(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); } @@ -627,8 +657,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, /* * Look for known traps using IST, and substitute them * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault and - * machine_check, so we should never see them. Warn if + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if * there's an unexpected IST-using fault handler. */ if (addr == (unsigned long)debug) @@ -643,7 +673,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, return 0; #ifdef CONFIG_X86_MCE } else if (addr == (unsigned long)machine_check) { - return 0; + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; #endif } else { /* Some other trap using IST? */ @@ -1124,9 +1158,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, @@ -1258,7 +1290,6 @@ asmlinkage void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; - pgd_t *pgd; if (!xen_start_info) return; @@ -1350,8 +1381,6 @@ asmlinkage void __init xen_start_kernel(void) acpi_numa = -1; #endif - pgd = (pgd_t *)xen_start_info->pt_base; - /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1360,7 +1389,7 @@ asmlinkage void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); @@ -1439,32 +1468,6 @@ asmlinkage void __init xen_start_kernel(void) #endif } -static int init_hvm_pv_info(int *major, int *minor) -{ - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; - - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - *major = eax >> 16; - *minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); - - cpuid(base + 2, &pages, &msr, &ecx, &edx); - - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - xen_setup_features(); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; - - return 0; -} - void __ref xen_hvm_init_shared_info(void) { int cpu; @@ -1497,6 +1500,31 @@ void __ref xen_hvm_init_shared_info(void) } #ifdef CONFIG_XEN_PVHVM +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info.name = "Xen HVM"; + + xen_domain_type = XEN_HVM_DOMAIN; +} + static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1519,12 +1547,7 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { static void __init xen_hvm_guest_init(void) { - int r; - int major, minor; - - r = init_hvm_pv_info(&major, &minor); - if (r < 0) - return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785631ce..5a16824cc2b3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -84,6 +84,7 @@ */ DEFINE_SPINLOCK(xen_reservation_lock); +#ifdef CONFIG_X86_32 /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -91,7 +92,7 @@ DEFINE_SPINLOCK(xen_reservation_lock); */ #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); - +#endif #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; @@ -308,8 +309,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { - if (!xen_batched_set_pte(ptep, pteval)) - native_set_pte(ptep, pteval); + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } } static void xen_set_pte(pte_t *ptep, pte_t pteval) @@ -1162,9 +1175,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(pgd_t *base) -{ -} +static void xen_post_allocator_init(void); static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) { @@ -1180,14 +1191,87 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) } } -static void xen_post_allocator_init(void); +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); -static void __init xen_pagetable_setup_done(pgd_t *base) + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} +#endif +static void __init xen_pagetable_init(void) { +#ifdef CONFIG_X86_64 + unsigned long size; + unsigned long addr; +#endif + paging_init(); xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long new_mfn_list; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* On 32-bit, we get zero so this never gets executed. */ + new_mfn_list = xen_revector_p2m_tree(); + if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + } else + goto skip; + } + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superflous and is not neccessary, but you know what + * lets do it. The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif +skip: +#endif xen_post_allocator_init(); } - static void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; @@ -1244,7 +1328,8 @@ static void xen_flush_tlb_single(unsigned long addr) } static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long va) + struct mm_struct *mm, unsigned long start, + unsigned long end) { struct { struct mmuext_op op; @@ -1256,7 +1341,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_others(cpus, mm, va); + trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1269,11 +1354,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; + args->op.arg1.linear_addr = start; } MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); @@ -1416,13 +1500,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) } #endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { - pte = mask_rw_pte(ptep, pte); + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); - xen_set_pte(ptep, pte); + native_set_pte(ptep, pte); } static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) @@ -1628,7 +1727,7 @@ static void set_page_prot(void *addr, pgprot_t prot) if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } - +#ifdef CONFIG_X86_32 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; @@ -1679,7 +1778,7 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) set_page_prot(pmd, PAGE_KERNEL_RO); } - +#endif void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; @@ -1707,7 +1806,20 @@ static void convert_pfn_mfn(void *v) for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } - +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_end)--; + } +} /* * Set up the initial kernel pagetable. * @@ -1719,11 +1831,13 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; /* max_pfn_mapped is the last pfn mapped in the initial memory * mappings. Considering that on Xen after the kernel mappings we @@ -1731,32 +1845,53 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, * set max_pfn_mapped to the last real pfn mapped. */ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][511] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! (But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ + copy_page(level2_ident_pgt, l2); + /* Graft it onto L4[511][511] */ + copy_page(level2_kernel_pgt, l2); + + /* Get [511][510] and graft that in level2_fixmap_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); + copy_page(level2_fixmap_pgt, l2); + /* Note that we don't do anything with level1_fixmap_pgt which + * we don't need. */ /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); @@ -1767,22 +1902,28 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Switch over */ - pgd = init_level4_pgt; - /* * At this stage there can be no user pgd, and no page * structure to attach it to, so make sure we just set kernel * pgd. */ xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); + __xen_write_cr3(true, __pa(init_level4_pgt)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. + */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); - return pgd; + /* Our (by three pages) smaller Xen pagetable that we are using */ + memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); @@ -1807,8 +1948,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) */ swapper_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - memcpy(swapper_kernel_pmd, initial_kernel_pmd, - sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); swapper_pg_dir[KERNEL_PGD_BOUNDARY] = __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); @@ -1825,8 +1965,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1838,11 +1977,11 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + copy_page(initial_page_table, pgd); initial_page_table[KERNEL_PGD_BOUNDARY] = __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); @@ -1858,8 +1997,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); - - return initial_page_table; } #endif /* CONFIG_X86_64 */ @@ -2041,8 +2178,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; - x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2310,6 +2446,9 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EINVAL; + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == @@ -2328,8 +2467,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (err) goto out; - err = -EFAULT; - if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); + if (err < 0) goto out; nr -= batch; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 64effdc6da94..b5e4d302a067 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -22,7 +22,7 @@ * * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. + * 512 and 1024 entries respectively. * * In short, these structures contain the Machine Frame Number (MFN) of the PFN. * @@ -139,11 +139,11 @@ * / | ~0, ~0, .... | * | \---------------/ * | - * p2m_missing p2m_missing - * /------------------\ /------------\ - * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | - * | [p2m_mid_missing]+---->| ..., ~0 | - * \------------------/ \------------/ + * p2m_mid_missing p2m_missing + * /-----------------\ /------------\ + * | [p2m_missing] +---->| ~0, ~0, ~0 | + * | [p2m_missing] +---->| ..., ~0 | + * \-----------------/ \------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -194,6 +194,13 @@ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID * boundary violation will require three middle nodes. */ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); +/* When we populate back during bootup, the amount of pages can vary. The + * max we have is seen is 395979, but that does not mean it can't be more. + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. */ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -389,7 +396,85 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } +#ifdef CONFIG_X86_64 +#include <linux/bootmem.h> +unsigned long __init xen_revector_p2m_tree(void) +{ + unsigned long va_start; + unsigned long va_end; + unsigned long pfn; + unsigned long pfn_free = 0; + unsigned long *mfn_list = NULL; + unsigned long size; + + va_start = xen_start_info->mfn_list; + /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), + * so make sure it is rounded up to that */ + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + va_end = va_start + size; + + /* If we were revectored already, don't do it again. */ + if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) + return 0; + + mfn_list = alloc_bootmem_align(size, PAGE_SIZE); + if (!mfn_list) { + pr_warn("Could not allocate space for a new P2M tree!\n"); + return xen_start_info->mfn_list; + } + /* Fill it out with INVALID_P2M_ENTRY value */ + memset(mfn_list, 0xFF, size); + + for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx; + unsigned long *mid_p; + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + mid_p = p2m_top[topidx][mididx]; + if (!mid_p) + continue; + if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) + continue; + + if ((unsigned long)mid_p == INVALID_P2M_ENTRY) + continue; + /* The old va. Rebase it on mfn_list */ + if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { + unsigned long *new; + + if (pfn_free > (size / sizeof(unsigned long))) { + WARN(1, "Only allocated for %ld pages, but we want %ld!\n", + size / sizeof(unsigned long), pfn_free); + return 0; + } + new = &mfn_list[pfn_free]; + + copy_page(new, mid_p); + p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); + + pfn_free += P2M_PER_PAGE; + + } + /* This should be the leafs allocated for identity from _brk. */ + } + return (unsigned long)mfn_list; + +} +#else +unsigned long __init xen_revector_p2m_tree(void) +{ + return 0; +} +#endif unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, mididx, idx; @@ -423,7 +508,7 @@ static void free_p2m_page(void *p) free_page((unsigned long)p); } -/* +/* * Fully allocate the p2m structure for a given pfn. We need to check * that both the top and mid levels are allocated, and make sure the * parallel mfn tree is kept in sync. We may race with other cpus, so @@ -570,12 +655,99 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. + */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ + unsigned topidx; + unsigned mididx; + unsigned ident_pfns; + unsigned inv_pfns; + unsigned long *p2m; + unsigned long *mid_mfn_p; + unsigned idx; + unsigned long pfn; + + /* We only look when this entails a P2M middle layer */ + if (p2m_index(set_pfn)) + return false; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + p2m = p2m_top[topidx][mididx]; + if (!p2m) + continue; + + if ((p2m == p2m_missing) || (p2m == p2m_identity)) + continue; + + if ((unsigned long)p2m == INVALID_P2M_ENTRY) + continue; + + ident_pfns = 0; + inv_pfns = 0; + for (idx = 0; idx < P2M_PER_PAGE; idx++) { + /* IDENTITY_PFNs are 1:1 */ + if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) + ident_pfns++; + else if (p2m[idx] == INVALID_P2M_ENTRY) + inv_pfns++; + else + break; + } + if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) + goto found; + } + return false; +found: + /* Found one, replace old with p2m_identity or p2m_missing */ + p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); + /* And the other for save/restore.. */ + mid_mfn_p = p2m_top_mfn_p[topidx]; + /* NOTE: Even if it is a p2m_identity it should still be point to + * a page filled with INVALID_P2M_ENTRY entries. */ + mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + + /* Reset where we want to stick the old page in. */ + topidx = p2m_top_index(set_pfn); + mididx = p2m_mid_index(set_pfn); + + /* This shouldn't happen */ + if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) + early_alloc_p2m(set_pfn); + + if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) + return false; + + p2m_init(p2m); + p2m_top[topidx][mididx] = p2m; + mid_mfn_p = p2m_top_mfn_p[topidx]; + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!early_alloc_p2m(pfn)) return false; + if (early_can_reuse_p2m_middle(pfn, mfn)) + return __set_phys_to_machine(pfn, mfn); + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) return false; diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 1608244756de..969570491c39 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,11 +8,14 @@ #include <xen/xen.h> #include <asm/iommu_table.h> + +#include <asm/xen/swiotlb-xen.h> #ifdef CONFIG_X86_64 #include <asm/iommu.h> #include <asm/dma.h> #endif #include <linux/export.h> + int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index ffcf2615640b..0a7852483ffe 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -24,6 +24,7 @@ #include <linux/module.h> #include <xen/platform_pci.h> +#include "xen-ops.h" #define XEN_PLATFORM_ERR_MAGIC -1 #define XEN_PLATFORM_ERR_PROTOCOL -2 diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a4790bf22c59..3edb320d508f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); + for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + continue; + WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); - for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } } static unsigned long __init xen_do_chunk(unsigned long start, @@ -157,25 +164,24 @@ static unsigned long __init xen_populate_chunk( unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { - unsigned long credits = credits_left; unsigned long s_pfn; unsigned long e_pfn; unsigned long pfns; long capacity; - if (credits <= 0) + if (credits_left <= 0) break; if (entry->type != E820_RAM) continue; - e_pfn = PFN_UP(entry->addr + entry->size); + e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after the xen_start_info->nr_pages */ if (e_pfn <= max_pfn) continue; - s_pfn = PFN_DOWN(entry->addr); + s_pfn = PFN_UP(entry->addr); /* If the E820 falls within the nr_pages, we want to start * at the nr_pages PFN. * If that would mean going past the E820 entry, skip it @@ -184,23 +190,19 @@ static unsigned long __init xen_populate_chunk( capacity = e_pfn - max_pfn; dest_pfn = max_pfn; } else { - /* last_pfn MUST be within E820_RAM regions */ - if (*last_pfn && e_pfn >= *last_pfn) - s_pfn = *last_pfn; capacity = e_pfn - s_pfn; dest_pfn = s_pfn; } - /* If we had filled this E820_RAM entry, go to the next one. */ - if (capacity <= 0) - continue; - if (credits > capacity) - credits = capacity; + if (credits_left < capacity) + capacity = credits_left; - pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); + pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); done += pfns; - credits_left -= pfns; *last_pfn = (dest_pfn + pfns); + if (pfns < capacity) + break; + credits_left -= pfns; } return done; } @@ -429,6 +431,24 @@ char * __init xen_memory_setup(void) * - mfn_list * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> + * We tried to make the the memblock_reserve more selective so + * that it would be clear what region is reserved. Sadly we ran + * in the problem wherein on a 64-bit hypervisor with a 32-bit + * initial domain, the pt_base has the cr3 value which is not + * neccessarily where the pagetable starts! As Jan put it: " + * Actually, the adjustment turns out to be correct: The page + * tables for a 32-on-64 dom0 get allocated in the order "first L1", + * "first L2", "first L3", so the offset to the page table base is + * indeed 2. When reading xen/include/public/xen.h's comment + * very strictly, this is not a violation (since there nothing is said + * that the first thing in the page table space is pointed to by + * pt_base; I admit that this seems to be implied though, namely + * do I think that it is implied that the page table space is the + * range [pt_base, pt_base + nt_pt_frames), whereas that + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), + * which - without a priori knowledge - the kernel would have + * difficulty to figure out)." - so lets just fall back to the + * easy way and reserve the whole region. */ memblock_reserve(__pa(xen_start_info->mfn_list), xen_start_info->pt_base - xen_start_info->mfn_list); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index afb250d22a6b..f58dca7a6e52 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void) notify_cpu_starting(cpu); - ipi_call_lock(); set_cpu_online(cpu, true); - ipi_call_unlock(); this_cpu_write(cpu_state, CPU_ONLINE); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index aaa7291c9259..7faed5869e5b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -28,9 +28,61 @@ ENTRY(startup_xen) __FINIT .pushsection .text - .align PAGE_SIZE + .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE +#define NEXT_HYPERCALL(x) \ + ENTRY(xen_hypercall_##x) \ + .skip 32 + +NEXT_HYPERCALL(set_trap_table) +NEXT_HYPERCALL(mmu_update) +NEXT_HYPERCALL(set_gdt) +NEXT_HYPERCALL(stack_switch) +NEXT_HYPERCALL(set_callbacks) +NEXT_HYPERCALL(fpu_taskswitch) +NEXT_HYPERCALL(sched_op_compat) +NEXT_HYPERCALL(platform_op) +NEXT_HYPERCALL(set_debugreg) +NEXT_HYPERCALL(get_debugreg) +NEXT_HYPERCALL(update_descriptor) +NEXT_HYPERCALL(ni) +NEXT_HYPERCALL(memory_op) +NEXT_HYPERCALL(multicall) +NEXT_HYPERCALL(update_va_mapping) +NEXT_HYPERCALL(set_timer_op) +NEXT_HYPERCALL(event_channel_op_compat) +NEXT_HYPERCALL(xen_version) +NEXT_HYPERCALL(console_io) +NEXT_HYPERCALL(physdev_op_compat) +NEXT_HYPERCALL(grant_table_op) +NEXT_HYPERCALL(vm_assist) +NEXT_HYPERCALL(update_va_mapping_otherdomain) +NEXT_HYPERCALL(iret) +NEXT_HYPERCALL(vcpu_op) +NEXT_HYPERCALL(set_segment_base) +NEXT_HYPERCALL(mmuext_op) +NEXT_HYPERCALL(xsm_op) +NEXT_HYPERCALL(nmi_op) +NEXT_HYPERCALL(sched_op) +NEXT_HYPERCALL(callback_op) +NEXT_HYPERCALL(xenoprof_op) +NEXT_HYPERCALL(event_channel_op) +NEXT_HYPERCALL(physdev_op) +NEXT_HYPERCALL(hvm_op) +NEXT_HYPERCALL(sysctl) +NEXT_HYPERCALL(domctl) +NEXT_HYPERCALL(kexec_op) +NEXT_HYPERCALL(tmem_op) /* 38 */ +ENTRY(xen_hypercall_rsvr) + .skip 320 +NEXT_HYPERCALL(mca) /* 48 */ +NEXT_HYPERCALL(arch_1) +NEXT_HYPERCALL(arch_2) +NEXT_HYPERCALL(arch_3) +NEXT_HYPERCALL(arch_4) +NEXT_HYPERCALL(arch_5) +NEXT_HYPERCALL(arch_6) + .balign PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c150154..bb5a8105ea86 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -27,7 +27,7 @@ void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); -pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; @@ -45,6 +45,7 @@ void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); +unsigned long __init xen_revector_p2m_tree(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); |