From bbc03778b9954a2ec93baed63718e4df0192f130 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 20 Jul 2015 16:01:53 -0700 Subject: x86/mm: Add parenthesis for TLB tracepoint size calculation flush_tlb_info->flush_start/end are both normal virtual addresses. When calculating 'nr_pages' (only used for the tracepoint), I neglected to put parentheses in. Thanks to David Koufaty for pointing this out. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Cc: Link: http://lkml.kernel.org/r/20150720230153.9E834081@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3250f2371aea..90b924acd982 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -117,7 +117,7 @@ static void flush_tlb_func(void *info) } else { unsigned long addr; unsigned long nr_pages = - f->flush_end - f->flush_start / PAGE_SIZE; + (f->flush_end - f->flush_start) / PAGE_SIZE; addr = f->flush_start; while (addr < f->flush_end) { __flush_tlb_single(addr); -- cgit From a89652769470d12cd484ee3d3f7bde0742be8d96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 20 Jul 2015 14:29:58 -0700 Subject: x86/mpx: Do not set ->vm_ops on MPX VMAs MPX sets up a private anonymous mapping, but uses vma->vm_ops too. This can confuse the core VM, as it relies on vma->vm_ops to distinguish file VMAs from anonymous ones. As a result we will get SIGBUS, because handle_pte_fault() thinks it's a file VMA without vm_ops->fault and it doesn't know how to handle the situation properly. Let's fix that by not setting ->vm_ops. We don't really need ->vm_ops here: an MPX VMA can be detected with the VM_MPX flag. And vma_merge() will not merge an MPX VMA with a non-MPX VMA, because ->vm_flags won't match. The only thing left is the name of the VMA. I'm not sure if it's part of the ABI, or if we can just drop it. The patch keeps it by providing arch_vma_name() on x86. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Cc: # Fixes: 6b7339f4 (mm: avoid setting up anonymous pages into file mapping) Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Link: http://lkml.kernel.org/r/20150720212958.305CC3E9@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 7 +++++++ arch/x86/mm/mpx.c | 24 +++--------------------- 2 files changed, 10 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 9d518d693b4b..844b06d67df4 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -126,3 +126,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_MPX) + return "[mpx]"; + return NULL; +} diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 7a657f58bbea..db1b0bc5017c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -20,20 +20,6 @@ #define CREATE_TRACE_POINTS #include -static const char *mpx_mapping_name(struct vm_area_struct *vma) -{ - return "[mpx]"; -} - -static struct vm_operations_struct mpx_vma_ops = { - .name = mpx_mapping_name, -}; - -static int is_mpx_vma(struct vm_area_struct *vma) -{ - return (vma->vm_ops == &mpx_vma_ops); -} - static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) { if (is_64bit_mm(mm)) @@ -53,9 +39,6 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) /* * This is really a simplified "vm_mmap". it only handles MPX * bounds tables (the bounds directory is user-allocated). - * - * Later on, we use the vma->vm_ops to uniquely identify these - * VMAs. */ static unsigned long mpx_mmap(unsigned long len) { @@ -101,7 +84,6 @@ static unsigned long mpx_mmap(unsigned long len) ret = -ENOMEM; goto out; } - vma->vm_ops = &mpx_vma_ops; if (vm_flags & VM_LOCKED) { up_write(&mm->mmap_sem); @@ -812,7 +794,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, * so stop immediately and return an error. This * probably results in a SIGSEGV. */ - if (!is_mpx_vma(vma)) + if (!(vma->vm_flags & VM_MPX)) return -EINVAL; len = min(vma->vm_end, end) - addr; @@ -945,9 +927,9 @@ static int try_unmap_single_bt(struct mm_struct *mm, * lots of tables even though we have no actual table * entries in use. */ - while (next && is_mpx_vma(next)) + while (next && (next->vm_flags & VM_MPX)) next = next->vm_next; - while (prev && is_mpx_vma(prev)) + while (prev && (prev->vm_flags & VM_MPX)) prev = prev->vm_prev; /* * We know 'start' and 'end' lie within an area controlled -- cgit From 5bc016f1abaa1c5ac0e3af23aa79faec4634a074 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Jul 2015 08:49:01 +0100 Subject: x86/fpu: Disable dependent CPU features on "noxsave" Complete the set of dependent features that need disabling at once: XSAVEC, AVX-512 and all currently known to the kernel extensions to it, as well as MPX need to be disabled too. 
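[ Editor's note: AVX, AVX-512 and MPX keep their register state in the XSAVE area, so once XSAVE is disabled that state can no longer be saved/restored on context switch. A minimal, hypothetical sketch of why the dependent bits have to be cleared together — boot_cpu_has()/kernel_fpu_begin()/kernel_fpu_end() are real kernel APIs, do_avx512_work() is made up for illustration: ]

	/* hypothetical consumer, for illustration only */
	if (boot_cpu_has(X86_FEATURE_AVX512F)) {
		kernel_fpu_begin();
		do_avx512_work();	/* touches ZMM registers */
		kernel_fpu_end();
	}
	/*
	 * Such code checks only its own feature bit.  With "noxsave" the
	 * ZMM (and bounds-register) state cannot be preserved across a
	 * context switch, so the dependent bits must be cleared as well
	 * to keep paths like this from ever being taken.
	 */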
Signed-off-by: Jan Beulich Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55ACC40D0200007800092E6C@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0b39173dd971..1e173f6285c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -351,9 +351,15 @@ static int __init x86_noxsave_setup(char *s) setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVEC); setup_clear_cpu_cap(X86_FEATURE_XSAVES); setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_MPX); return 1; } -- cgit From ca1fec58bc6a90be96a59b4769e951156846c6ca Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Jul 2015 08:46:14 +0100 Subject: x86/mm/pat: Adjust default caching mode translation tables Make WT really mean WT (rather than UC). I can't see why commit 9cd25aac1f ("x86/mm/pat: Emulate PAT when it is disabled") didn't make this to match its changes to pat_init(). Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/55ACC3660200007800092E62@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8533b46e6bee..7a4532229f51 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,18 +43,18 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, - [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = _PAGE_PWT | 0, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WT, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WT, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -- cgit From 1c9cf9b211030a454a84cbc1cb15b82d9aa49011 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:14 -0600 Subject: x86/mm: Move warning from __ioremap_check_ram() to the call site __ioremap_check_ram() has a WARN_ONCE() which is emitted when the given pfn range is not RAM. The warning is bogus in two aspects: - it never triggers since walk_system_ram_range() only calls __ioremap_check_ram() for RAM ranges. - the warning message is wrong as it says: "ioremap on RAM' after it established that the pfn range is not RAM. 
Move the WARN_ONCE() to __ioremap_caller(), and update the message to include the address range so we get an actual warning when something tries to ioremap system RAM. [ tglx: Massaged changelog ] Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Roland Dreier Cc: Luis R. Rodriguez Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-2-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index cc5ccc415cc0..fd3df0ddb4d4 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -63,8 +63,6 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, !PageReserved(pfn_to_page(start_pfn + i))) return 1; - WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); - return 0; } @@ -131,8 +129,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) + __ioremap_check_ram) == 1) { + WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", + phys_addr, last_addr); return NULL; + } } /* * Mappings have to be page-aligned -- cgit From 9a58eebe1ace609bedf8c5a65e70a097459f5696 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:15 -0600 Subject: x86/mm: Remove region_is_ram() call from ioremap __ioremap_caller() calls region_is_ram() to walk through the iomem_resource table to check if a target range is in RAM, which was added to improve the lookup performance over page_is_ram() (commit 906e36c5c717 "x86: use optimized ioresource lookup in ioremap function"). page_is_ram() was no longer used when this change was added, though. __ioremap_caller() then calls walk_system_ram_range(), which had replaced page_is_ram() to improve the lookup performance (commit c81c8a1eeede "x86, ioremap: Speed up check for RAM pages"). Since both checks walk through the same iomem_resource table for the same purpose, there is no need to call both functions. Aside of that walk_system_ram_range() is the only useful check at the moment because region_is_ram() always returns -1 due to an implementation bug. That bug in region_is_ram() cannot be fixed without breaking existing ioremap callers, which rely on the subtle difference of walk_system_ram_range() versus non page aligned ranges. Once these offending callers are fixed we can use region_is_ram() and remove walk_system_ram_range(). [ tglx: Massaged changelog ] Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Roland Dreier Cc: Mike Travis Cc: Luis R. Rodriguez Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-3-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fd3df0ddb4d4..b9d4a33017fd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -92,7 +92,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pgprot_t prot; int retval; void __iomem *ret_addr; - int ram_region; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -115,26 +114,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - /* First check if whole region can be identified as RAM or not */ - ram_region = region_is_ram(phys_addr, size); - if (ram_region > 0) { - WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n", - (unsigned long int)phys_addr, - (unsigned long int)last_addr); - return NULL; - } - - /* If could not be identified(-1), check page by page */ - if (ram_region < 0) { - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + pfn = phys_addr >> PAGE_SHIFT; + last_pfn = last_addr >> PAGE_SHIFT; + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, __ioremap_check_ram) == 1) { - WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", + WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", phys_addr, last_addr); - return NULL; - } + return NULL; } + /* * Mappings have to be page-aligned */ -- cgit From 10dc331ff5e7e4668c0f0c95b1a873aba9b70826 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2015 03:25:54 +0800 Subject: KVM: MTRR: fix memory type handling if MTRR is completely disabled Currently code uses default memory type if MTRR is fully disabled, fix it by using UC instead. Signed-off-by: Xiao Guangrong Tested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index de1d2d8062e2..e2750134a22b 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -120,6 +120,16 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; } +static u8 mtrr_disabled_type(void) +{ + /* + * Intel SDM 11.11.2.2: all MTRRs are disabled when + * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC + * memory type is applied to all of physical memory. + */ + return MTRR_TYPE_UNCACHABLE; +} + /* * Three terms are used in the following code: * - segment, it indicates the address segments covered by fixed MTRRs. @@ -434,6 +444,8 @@ struct mtrr_iter { /* output fields. */ int mem_type; + /* mtrr is completely disabled? */ + bool mtrr_disabled; /* [start, end) is not fully covered in MTRRs? 
*/ bool partial_map; @@ -549,7 +561,7 @@ static void mtrr_lookup_var_next(struct mtrr_iter *iter) static void mtrr_lookup_start(struct mtrr_iter *iter) { if (!mtrr_is_enabled(iter->mtrr_state)) { - iter->partial_map = true; + iter->mtrr_disabled = true; return; } @@ -563,6 +575,7 @@ static void mtrr_lookup_init(struct mtrr_iter *iter, iter->mtrr_state = mtrr_state; iter->start = start; iter->end = end; + iter->mtrr_disabled = false; iter->partial_map = false; iter->fixed = false; iter->range = NULL; @@ -656,6 +669,9 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) return MTRR_TYPE_WRBACK; } + if (iter.mtrr_disabled) + return mtrr_disabled_type(); + /* It is not covered by MTRRs. */ if (iter.partial_map) { /* @@ -689,6 +705,9 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } + if (iter.mtrr_disabled) + return true; + if (!iter.partial_map) return true; -- cgit From 3e5d2fdceda172554e681b68c853bf5d08205bbf Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2015 03:25:55 +0800 Subject: KVM: MTRR: simplify kvm_mtrr_get_guest_memory_type kvm_mtrr_get_guest_memory_type() never returns -1, which is implied in the current code: if @type == -1 (meaning no MTRR contains the range), iter.partial_map must be true. Simplify the code to indicate this fact. Signed-off-by: Xiao Guangrong Tested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index e2750134a22b..dc0a84a6f309 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -672,15 +672,16 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) if (iter.mtrr_disabled) return mtrr_disabled_type(); - /* It is not covered by MTRRs. */ - if (iter.partial_map) { - /* - * We just check one page, partially covered by MTRRs is - * impossible. - */ - WARN_ON(type != -1); - type = mtrr_default_type(mtrr_state); - } + /* + * We just check one page, partially covered by MTRRs is + * impossible. + */ + WARN_ON(iter.partial_map); + + /* not contained in any MTRRs. */ + if (type == -1) + return mtrr_default_type(mtrr_state); + return type; } EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); -- cgit From 41dbc6bcd9722bba8f09256655d060af30c63d34 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Jul 2015 08:22:45 +0200 Subject: KVM: x86: introduce kvm_check_has_quirk The logic of the disabled_quirks field usually results in a double negation. Wrap it in a simple function that checks the bit and negates it. Based on a patch from Xiao Guangrong. 
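[ Editor's note: for illustration only, the double negation this helper removes. A caller that wants to keep a legacy behaviour "unless the corresponding quirk has been disabled by userspace" currently reads (apply_legacy_behaviour() is a made-up call site; the real ones are in the diff below): ]

	/* before: double negation */
	if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
		apply_legacy_behaviour();

	/* after: reads the way the sentence does */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_LINT0_REENABLED))
		apply_legacy_behaviour();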
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/x86.h | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 954e98a8c2e3..1c425443a41a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1595,7 +1595,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_LINT0_REENABLED)) apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bbc678a66b18..8cbec765b08d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1672,7 +1672,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index edc8cdcd786b..0ca2f3e4803c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } +static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) +{ + return !(kvm->arch.disabled_quirks & quirk); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); void kvm_set_pending_timer(struct kvm_vcpu *vcpu); -- cgit From fb279950ba02e3210a16b11ecfa8871f3ee0ca49 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2015 03:25:56 +0800 Subject: KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED OVMF depends on WB to boot fast, because it only clears caches after it has set up MTRRs---which is too late. Let's do writeback if CR0.CD is set to make it happy, similar to what SVM is already doing. Signed-off-by: Xiao Guangrong Tested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b4e9384717a..e9e1fc517ab5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8650,7 +8650,10 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (kvm_read_cr0(vcpu) & X86_CR0_CD) { ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_UNCACHABLE; + if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) + cache = MTRR_TYPE_WRBACK; + else + cache = MTRR_TYPE_UNCACHABLE; goto exit; } -- cgit From 0da029ed7ee5fdf49a2a0e14160c3e9999be9292 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Jul 2015 08:24:42 +0200 Subject: KVM: x86: rename quirk constants to KVM_X86_QUIRK_* Make them clearly architecture-dependent; the capability is valid for all architectures, but the argument is not. 
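[ Editor's note: a hedged userspace sketch, not part of this patch, of how the renamed constants are meant to be consumed. It assumes the pre-existing KVM_CAP_DISABLE_QUIRKS capability, which takes the quirk bitmask in args[0] of KVM_ENABLE_CAP on the VM file descriptor; vm_fd is illustrative: ]

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	struct kvm_enable_cap cap = {
		.cap     = KVM_CAP_DISABLE_QUIRKS,
		.args[0] = KVM_X86_QUIRK_LINT0_REENABLED |
			   KVM_X86_QUIRK_CD_NW_CLEARED,
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)	/* vm_fd: illustrative */
		perror("KVM_ENABLE_CAP");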
Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 4 ++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a4ae82eb82aa..cd54147cb365 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -354,7 +354,7 @@ struct kvm_xcrs { struct kvm_sync_regs { }; -#define KVM_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1c425443a41a..2a5ca97c263b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1595,7 +1595,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_LINT0_REENABLED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8cbec765b08d..8e0c0844c6b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1672,7 +1672,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e9e1fc517ab5..83b7b5cd75d5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8650,7 +8650,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (kvm_read_cr0(vcpu) & X86_CR0_CD) { ipat = VMX_EPT_IPAT_BIT; - if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; -- cgit From 8a0a5da6d9cbf1b142115ff6e6b253a82633c3d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2015 16:13:43 +0200 Subject: x86/mm: Fix newly introduced printk format warnings Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9d4a33017fd..b9c78f3bcd67 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -118,8 +118,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, last_pfn = last_addr >> PAGE_SHIFT; if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, __ioremap_check_ram) == 1) { - WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", - phys_addr, last_addr); + WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", + &phys_addr, &last_addr); return NULL; } -- cgit From c0c3322e98029e752526d906d9e3a680ed213c03 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Jul 2015 14:16:43 +0200 Subject: x86/asm/entry/32: Revert 'Do not use R9 in SYSCALL32' commit This change reverts most of commit 53e9accf0f 'Do not use R9 in SYSCALL32'. 
I don't yet understand how, but code in that commit sometimes fails to preserve EBP. See https://bugzilla.kernel.org/show_bug.cgi?id=101061 "Problems while executing 32-bit code on AMD64" Reported-and-tested-by: Krzysztof A. Sobiecki Signed-off-by: Denys Vlasenko Cc: Linus Torvalds Cc: Steven Rostedt Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Alexei Starovoitov Cc: Will Drewry Cc: Kees Cook CC: x86@kernel.org Link: http://lkml.kernel.org/r/1437740203-11552-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/entry_64_compat.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bb187a6a877c..5a1844765a7a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -205,7 +205,6 @@ sysexit_from_sys_call: movl RDX(%rsp), %edx /* arg3 */ movl RSI(%rsp), %ecx /* arg4 */ movl RDI(%rsp), %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ .endm .macro auditsys_exit exit @@ -236,6 +235,7 @@ sysexit_from_sys_call: sysenter_auditsys: auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ jmp sysenter_dispatch sysexit_audit: @@ -336,7 +336,7 @@ ENTRY(entry_SYSCALL_compat) * 32-bit zero extended: */ ASM_STAC -1: movl (%r8), %ebp +1: movl (%r8), %r9d _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -346,7 +346,7 @@ ENTRY(entry_SYSCALL_compat) cstar_do_call: /* 32-bit syscall -> 64-bit C ABI argument conversion */ movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ + /* r9 already loaded */ /* arg6 */ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ movl %ebx, %edi /* arg1 */ movl %edx, %edx /* arg3 (zero extension) */ @@ -358,7 +358,6 @@ cstar_dispatch: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: - movl RCX(%rsp), %ebp DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -392,7 +391,9 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: + movl %r9d, R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common + movl R9(%rsp), %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -404,14 +405,16 @@ cstar_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif + xchgl %r9d, %ebp SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) - movq %rax, R9(%rsp) + movq %r9, R9(%rsp) movq %rax, R8(%rsp) movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter + movl R9(%rsp), %r9d /* Reload arg registers from stack. 
(see sysenter_tracesys) */ movl RCX(%rsp), %ecx @@ -421,6 +424,7 @@ cstar_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS + xchgl %ebp, %r9d jmp cstar_do_call END(entry_SYSCALL_compat) -- cgit From 2c534c0da0a68418693e10ce1c4146e085f39518 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 21 Jul 2015 15:55:09 +0100 Subject: perf/x86/intel/cqm: Return cached counter value from IRQ context Peter reported the following potential crash which I was able to reproduce with his test program, [ 148.765788] ------------[ cut here ]------------ [ 148.765796] WARNING: CPU: 34 PID: 2840 at kernel/smp.c:417 smp_call_function_many+0xb6/0x260() [ 148.765797] Modules linked in: [ 148.765800] CPU: 34 PID: 2840 Comm: perf Not tainted 4.2.0-rc1+ #4 [ 148.765803] ffffffff81cdc398 ffff88085f105950 ffffffff818bdfd5 0000000000000007 [ 148.765805] 0000000000000000 ffff88085f105990 ffffffff810e413a 0000000000000000 [ 148.765807] ffffffff82301080 0000000000000022 ffffffff8107f640 ffffffff8107f640 [ 148.765809] Call Trace: [ 148.765810] [] dump_stack+0x45/0x57 [ 148.765818] [] warn_slowpath_common+0x8a/0xc0 [ 148.765822] [] ? intel_cqm_stable+0x60/0x60 [ 148.765824] [] ? intel_cqm_stable+0x60/0x60 [ 148.765825] [] warn_slowpath_null+0x1a/0x20 [ 148.765827] [] smp_call_function_many+0xb6/0x260 [ 148.765829] [] ? intel_cqm_stable+0x60/0x60 [ 148.765831] [] on_each_cpu_mask+0x28/0x60 [ 148.765832] [] intel_cqm_event_count+0x7f/0xe0 [ 148.765836] [] perf_output_read+0x2a5/0x400 [ 148.765839] [] perf_output_sample+0x31a/0x590 [ 148.765840] [] ? perf_prepare_sample+0x26d/0x380 [ 148.765841] [] perf_event_output+0x47/0x60 [ 148.765843] [] __perf_event_overflow+0x215/0x240 [ 148.765844] [] perf_event_overflow+0x14/0x20 [ 148.765847] [] intel_pmu_handle_irq+0x1d4/0x440 [ 148.765849] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765853] [] ? vunmap_page_range+0x19d/0x2f0 [ 148.765854] [] ? unmap_kernel_range_noflush+0x11/0x20 [ 148.765859] [] ? ghes_copy_tofrom_phys+0x11e/0x2a0 [ 148.765863] [] ? native_apic_msr_write+0x2b/0x30 [ 148.765865] [] ? x2apic_send_IPI_self+0x1d/0x20 [ 148.765869] [] ? arch_irq_work_raise+0x35/0x40 [ 148.765872] [] ? irq_work_queue+0x66/0x80 [ 148.765875] [] perf_event_nmi_handler+0x26/0x40 [ 148.765877] [] nmi_handle+0x79/0x100 [ 148.765879] [] default_do_nmi+0x42/0x100 [ 148.765880] [] do_nmi+0x83/0xb0 [ 148.765884] [] end_repeat_nmi+0x1e/0x2e [ 148.765886] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765888] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765890] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765891] <> [] finish_task_switch+0x156/0x210 [ 148.765898] [] __schedule+0x341/0x920 [ 148.765899] [] schedule+0x37/0x80 [ 148.765903] [] ? do_page_fault+0x2f/0x80 [ 148.765905] [] schedule_user+0x1a/0x50 [ 148.765907] [] retint_careful+0x14/0x32 [ 148.765908] ---[ end trace e33ff2be78e14901 ]--- The CQM task events are not safe to be called from within interrupt context because they require performing an IPI to read the counter value on all sockets. And performing IPIs from within IRQ context is a "no-no". Make do with the last read counter value currently event in event->count when we're invoked in this context. 
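[ Editor's note: the "no-no" is enforced by the generic SMP code; the kernel/smp.c:417 warning in the trace above comes from a check in smp_call_function_many() along these lines (paraphrased from memory, simplified, not part of this patch): ]

	/* kernel/smp.c, approximate */
	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
		     && !oops_in_progress && !early_boot_irqs_disabled);

[ The counter read is reached here from the PMI path with interrupts disabled, so the IPI-based refresh has to be skipped in that context, which is what the hunk below does by falling back to the cached event->count. ]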
Reported-by: Peter Zijlstra Signed-off-by: Matt Fleming Cc: Thomas Gleixner Cc: Vikas Shivappa Cc: Kanaka Juvva Cc: Will Auld Cc: Link: http://lkml.kernel.org/r/1437490509-15373-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 188076161c1b..63eb68b73589 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -951,6 +951,14 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!cqm_group_leader(event)) return 0; + /* + * Getting up-to-date values requires an SMP IPI which is not + * possible if we're being called in interrupt context. Return + * the cached values instead. + */ + if (unlikely(in_interrupt())) + goto out; + /* * Notice that we don't perform the reading of an RMID * atomically, because we can't hold a spin lock across the -- cgit From 1a4e8795711f474b31ff6eac37f3efd304ed8a93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2015 10:27:37 +0200 Subject: x86/mm/pat: Revert 'Adjust default caching mode translation tables' Toshi explains: "No, the default values need to be set to the fallback types, i.e. minimal supported mode. For WC and WT, UC is the fallback type. When PAT is disabled, pat_init() does update the tables below to enable WT per the default BIOS setup. However, when PAT is enabled, but CPU has PAT -errata, WT falls back to UC per the default values." Revert: ca1fec58bc6a 'x86/mm/pat: Adjust default caching mode translation tables' Requested-by: Toshi Kani Cc: Jan Beulich Link: http://lkml.kernel.org/r/1437577776.3214.252.camel@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a4532229f51..8533b46e6bee 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,18 +43,18 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, - [_PAGE_CACHE_MODE_WT ] = _PAGE_PWT | 0, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WT, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WT, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -- cgit From 2482abb93ebf7bfbf85965ca907f0058ff968c59 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jul 2015 15:26:36 +0200 Subject: ebpf, x86: fix general protection fault when tail call is invoked With eBPF JIT compiler enabled on x86_64, I was able to reliably trigger the following general protection fault out of an eBPF program with a simple tail call, f.e. 
tracex5 (or a stripped down version of it): [ 927.097918] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC [...] [ 927.100870] task: ffff8801f228b780 ti: ffff880016a64000 task.ti: ffff880016a64000 [ 927.102096] RIP: 0010:[] [] 0xffffffffa002440d [ 927.103390] RSP: 0018:ffff880016a67a68 EFLAGS: 00010006 [ 927.104683] RAX: 5a5a5a5a5a5a5a5a RBX: 0000000000000000 RCX: 0000000000000001 [ 927.105921] RDX: 0000000000000000 RSI: ffff88014e438000 RDI: ffff880016a67e00 [ 927.107137] RBP: ffff880016a67c90 R08: 0000000000000000 R09: 0000000000000001 [ 927.108351] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880016a67e00 [ 927.109567] R13: 0000000000000000 R14: ffff88026500e460 R15: ffff880220a81520 [ 927.110787] FS: 00007fe7d5c1f740(0000) GS:ffff880265000000(0000) knlGS:0000000000000000 [ 927.112021] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 927.113255] CR2: 0000003e7bbb91a0 CR3: 000000006e04b000 CR4: 00000000001407e0 [ 927.114500] Stack: [ 927.115737] ffffc90008cdb000 ffff880016a67e00 ffff88026500e460 ffff880220a81520 [ 927.117005] 0000000100000000 000000000000001b ffff880016a67aa8 ffffffff8106c548 [ 927.118276] 00007ffcdaf22e58 0000000000000000 0000000000000000 ffff880016a67ff0 [ 927.119543] Call Trace: [ 927.120797] [] ? lookup_address+0x28/0x30 [ 927.122058] [] ? __module_text_address+0x16/0x70 [ 927.123314] [] ? is_ftrace_trampoline+0x3e/0x70 [ 927.124562] [] ? __kernel_text_address+0x5f/0x80 [ 927.125806] [] ? print_context_stack+0x7f/0xf0 [ 927.127033] [] ? __lock_acquire+0x572/0x2050 [ 927.128254] [] ? __lock_acquire+0x572/0x2050 [ 927.129461] [] ? trace_call_bpf+0x3a/0x140 [ 927.130654] [] trace_call_bpf+0x8a/0x140 [ 927.131837] [] ? trace_call_bpf+0x3a/0x140 [ 927.133015] [] kprobe_perf_func+0x28/0x220 [ 927.134195] [] kprobe_dispatcher+0x38/0x60 [ 927.135367] [] ? seccomp_phase1+0x1/0x230 [ 927.136523] [] kprobe_ftrace_handler+0xf0/0x150 [ 927.137666] [] ? seccomp_phase1+0x5/0x230 [ 927.138802] [] ftrace_ops_recurs_func+0x5c/0xb0 [ 927.139934] [] 0xffffffffa022b0d5 [ 927.141066] [] ? seccomp_phase1+0x1/0x230 [ 927.142199] [] seccomp_phase1+0x5/0x230 [ 927.143323] [] syscall_trace_enter_phase1+0xc4/0x150 [ 927.144450] [] ? seccomp_phase1+0x5/0x230 [ 927.145572] [] ? syscall_trace_enter_phase1+0xc4/0x150 [ 927.146666] [] tracesys+0xd/0x44 [ 927.147723] Code: 48 8b 46 10 48 39 d0 76 2c 8b 85 fc fd ff ff 83 f8 20 77 21 83 c0 01 89 85 fc fd ff ff 48 8d 44 d6 80 48 8b 00 48 83 f8 00 74 0a <48> 8b 40 20 48 83 c0 33 ff e0 48 89 d8 48 8b 9d d8 fd ff ff 4c [ 927.150046] RIP [] 0xffffffffa002440d The code section with the instructions that traps points into the eBPF JIT image of the root program (the one invoking the tail call instruction). Using bpf_jit_disasm -o on the eBPF root program image: [...] 4e: mov -0x204(%rbp),%eax 8b 85 fc fd ff ff 54: cmp $0x20,%eax <--- if (tail_call_cnt > MAX_TAIL_CALL_CNT) 83 f8 20 57: ja 0x000000000000007a 77 21 59: add $0x1,%eax <--- tail_call_cnt++ 83 c0 01 5c: mov %eax,-0x204(%rbp) 89 85 fc fd ff ff 62: lea -0x80(%rsi,%rdx,8),%rax <--- prog = array->prog[index] 48 8d 44 d6 80 67: mov (%rax),%rax 48 8b 00 6a: cmp $0x0,%rax <--- check for NULL 48 83 f8 00 6e: je 0x000000000000007a 74 0a 70: mov 0x20(%rax),%rax <--- GPF triggered here! fetch of bpf_func 48 8b 40 20 [ matches <48> 8b 40 20 ... from above ] 74: add $0x33,%rax <--- prologue skip of new prog 48 83 c0 33 78: jmpq *%rax <--- jump to new prog insns ff e0 [...] The problem is that rax has 5a5a5a5a5a5a5a5a, which suggests a tail call jump to map slot 0 is pointing to a poisoned page. 
The issue is the following: lea instruction has a wrong offset, i.e. it should be ... lea 0x80(%rsi,%rdx,8),%rax ... but it actually seems to be ... lea -0x80(%rsi,%rdx,8),%rax ... where 0x80 is offsetof(struct bpf_array, prog), thus the offset needs to be positive instead of negative. Disassembling the interpreter, we btw similarly do: [...] c88: lea 0x80(%rax,%rdx,8),%rax <--- prog = array->prog[index] 48 8d 84 d0 80 00 00 00 c90: add $0x1,%r13d 41 83 c5 01 c94: mov (%rax),%rax 48 8b 00 [...] Now the other interesting fact is that this panic triggers only when things like CONFIG_LOCKDEP are being used. In that case offsetof(struct bpf_array, prog) starts at offset 0x80 and in non-CONFIG_LOCKDEP case at offset 0x50. Reason is that the work_struct inside struct bpf_map grows by 48 bytes in my case due to the lockdep_map member (which also has CONFIG_LOCK_STAT enabled members). Changing the emitter to always use the 4 byte displacement in the lea instruction fixes the panic on my side. It increases the tail call instruction emission by 3 more byte, but it should cover us from various combinations (and perhaps other future increases on related structures). After patch, disassembly: [...] 9e: lea 0x80(%rsi,%rdx,8),%rax <--- CONFIG_LOCKDEP/CONFIG_LOCK_STAT 48 8d 84 d6 80 00 00 00 a6: mov (%rax),%rax 48 8b 00 [...] [...] 9e: lea 0x50(%rsi,%rdx,8),%rax <--- No CONFIG_LOCKDEP 48 8d 84 d6 50 00 00 00 a6: mov (%rax),%rax 48 8b 00 [...] Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd74be0..be2e7a2b10d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -269,7 +269,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 44 /* number of bytes to jump */ +#define OFFSET1 47 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -278,15 +278,15 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 33 +#define OFFSET2 36 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ /* prog = array->prog[index]; */ - EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */ - EMIT1(offsetof(struct bpf_array, prog)); + EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + offsetof(struct bpf_array, prog)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) -- cgit
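[ Editor's note: a small standalone illustration, not part of the patch, of the displacement issue described above. The ModRM encoding selects either a sign-extended 8-bit displacement or a full 32-bit one; offsetof(struct bpf_array, prog) == 0x80 no longer fits a signed byte, so emitting it as a one-byte displacement turns +0x80 into -0x80, matching the "lea -0x80(%rsi,%rdx,8),%rax" in the broken JIT output, whereas the four-byte form keeps +0x80: ]

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint8_t raw = 0x80;            /* e.g. offsetof(struct bpf_array, prog) with lockdep */
		int8_t  disp8  = (int8_t)raw;  /* how a 1-byte displacement is interpreted */
		int32_t disp32 = (int32_t)raw; /* how a 4-byte displacement is interpreted */

		printf("disp8  = %d\n", disp8);   /* -128  -> lea -0x80(...) */
		printf("disp32 = %d\n", disp32);  /*  128  -> lea  0x80(...) */
		return 0;
	}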