From 21cdb6b568435738cc0b303b2b3b82742396310c Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 27 Nov 2015 21:09:30 +0000 Subject: x86/mm: Page align the '_end' symbol to avoid pfn conversion bugs Ingo noted that if we can guarantee _end is aligned to PAGE_SIZE we can automatically avoid bugs along the lines of, size = _end - _text >> PAGE_SHIFT which is missing a call to PFN_ALIGN(). The EFI mixed mode contains this bug, for example. _text is already aligned to PAGE_SIZE through the use of LOAD_PHYSICAL_ADDR, and the BSS and BRK sections are explicitly aligned in the linker script, so it makes sense to align _end to match. Reported-by: Ingo Molnar Signed-off-by: Matt Fleming Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1448658575-17029-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..4f1994257a18 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -325,6 +325,7 @@ SECTIONS __brk_limit = .; } + . = ALIGN(PAGE_SIZE); _end = .; STABS_DEBUG -- cgit From edc3b9129cecd0f0857112136f5b8b1bc1d45918 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 27 Nov 2015 21:09:31 +0000 Subject: x86/mm/pat: Ensure cpa->pfn only contains page frame numbers The x86 pageattr code is confused about the data that is stored in cpa->pfn, sometimes it's treated as a page frame number, sometimes it's treated as an unshifted physical address, and in one place it's treated as a pte. The result of this is that the mapping functions do not map the intended physical address. This isn't a problem in practice because most of the addresses we're mapping in the EFI code paths are already mapped in 'trampoline_pgd' and so the pageattr mapping functions don't actually do anything in this case. But when we move to using a separate page table for the EFI runtime this will be an issue. Signed-off-by: Matt Fleming Reviewed-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1448658575-17029-3-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 17 ++++++----------- arch/x86/platform/efi/efi_64.c | 16 ++++++++++------ 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4feed1..c70e42014101 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -905,15 +905,10 @@ static void populate_pte(struct cpa_data *cpa, pte = pte_offset_kernel(pmd, start); while (num_pages-- && start < end) { - - /* deal with the NX bit */ - if (!(pgprot_val(pgprot) & _PAGE_NX)) - cpa->pfn &= ~_PAGE_NX; - - set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; - cpa->pfn += PAGE_SIZE; + cpa->pfn++; pte++; } } @@ -969,11 +964,11 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | + set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pmd_pgprot))); start += PMD_SIZE; - cpa->pfn += PMD_SIZE; + cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } @@ -1042,11 +1037,11 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, * Map everything starting from the Gb boundary, possibly with 1G pages */ while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | + set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); start += PUD_SIZE; - cpa->pfn += PUD_SIZE; + cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a0ac0f9c307f..5aa186db59e3 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -143,7 +143,7 @@ void efi_sync_low_kernel_mappings(void) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - unsigned long text; + unsigned long pfn, text; struct page *page; unsigned npages; pgd_t *pgd; @@ -160,7 +160,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * and ident-map those pages containing the map before calling * phys_efi_set_virtual_address_map(). 
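A minimal sketch of the unit convention the pageattr and efi_64 hunks above and below enforce: a pfn counts PAGE_SIZE pages while a physical address counts bytes, so every conversion and increment has to go through PAGE_SHIFT. This is illustrative only; the example_* functions are made-up names, not code from the patch, and they use the standard PFN_DOWN()/PFN_ALIGN() helpers from <linux/pfn.h>.

    #include <linux/pfn.h>
    #include <asm/page.h>
    #include <asm/pgtable.h>

    /* Physical address -> page frame number. */
    static unsigned long example_phys_to_pfn(unsigned long phys)
    {
            return PFN_DOWN(phys);          /* phys >> PAGE_SHIFT */
    }

    /*
     * Number of pages spanned by [text, end), rounding 'end' up to a
     * page boundary first -- the missing PFN_ALIGN() mentioned in the
     * first changelog above.  'text' is assumed to be page aligned.
     */
    static unsigned long example_text_npages(unsigned long text, unsigned long end)
    {
            return (PFN_ALIGN(end) - text) >> PAGE_SHIFT;
    }

    /* Advancing a pfn across one PMD-sized mapping: add pages, not bytes. */
    static unsigned long example_next_pmd_pfn(unsigned long pfn)
    {
            return pfn + (PMD_SIZE >> PAGE_SHIFT);
    }
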
*/ - if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) { + pfn = pa_memmap >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX)) { pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); return 1; } @@ -185,8 +186,9 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) npages = (_end - _text) >> PAGE_SHIFT; text = __pa(_text); + pfn = text >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) { + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, 0)) { pr_err("Failed to map kernel text 1:1\n"); return 1; } @@ -204,12 +206,14 @@ void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) static void __init __map_region(efi_memory_desc_t *md, u64 va) { pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - unsigned long pf = 0; + unsigned long flags = 0; + unsigned long pfn; if (!(md->attribute & EFI_MEMORY_WB)) - pf |= _PAGE_PCD; + flags |= _PAGE_PCD; - if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf)) + pfn = md->phys_addr >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", md->phys_addr, va); } -- cgit From b61a76f8850d2979550abc42d7e09154ebb8d785 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 27 Nov 2015 21:09:32 +0000 Subject: x86/efi: Map RAM into the identity page table for mixed mode We are relying on the pre-existing mappings in 'trampoline_pgd' when accessing function arguments in the EFI mixed mode thunking code. Instead let's map memory explicitly so that things will continue to work when we move to a separate page table in the future. Signed-off-by: Matt Fleming Reviewed-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1448658575-17029-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_64.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 5aa186db59e3..102976dda8c4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -144,6 +144,7 @@ void efi_sync_low_kernel_mappings(void) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long pfn, text; + efi_memory_desc_t *md; struct page *page; unsigned npages; pgd_t *pgd; @@ -177,6 +178,25 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (!IS_ENABLED(CONFIG_EFI_MIXED)) return 0; + /* + * Map all of RAM so that we can access arguments in the 1:1 + * mapping when making EFI runtime calls. 
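For reference, a sketch of what the for_each_efi_memory_desc() walk just below amounts to: the EFI memory map is an array of descriptors whose stride is the firmware-reported memmap.desc_size, which may be larger than sizeof(efi_memory_desc_t). The function name is hypothetical; 'memmap' is the global struct efi_memory_map already used in this file.

    #include <linux/efi.h>

    static void example_walk_memmap(void)
    {
            void *p;

            /* Step by desc_size, not by sizeof(efi_memory_desc_t). */
            for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
                    efi_memory_desc_t *md = p;

                    /* Inspect md->type, md->phys_addr, md->num_pages ... */
            }
    }
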
+ */ + for_each_efi_memory_desc(&memmap, md) { + if (md->type != EFI_CONVENTIONAL_MEMORY && + md->type != EFI_LOADER_DATA && + md->type != EFI_LOADER_CODE) + continue; + + pfn = md->phys_addr >> PAGE_SHIFT; + npages = md->num_pages; + + if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, 0)) { + pr_err("Failed to map 1:1 memory\n"); + return 1; + } + } + page = alloc_page(GFP_KERNEL|__GFP_DMA32); if (!page) panic("Unable to allocate EFI runtime stack < 4GB\n"); -- cgit From c9f2a9a65e4855b74d92cdad688f6ee4a1a323ff Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 27 Nov 2015 21:09:33 +0000 Subject: x86/efi: Hoist page table switching code into efi_call_virt() This change is a prerequisite for pending patches that switch to a dedicated EFI page table, instead of using 'trampoline_pgd' which shares PGD entries with 'swapper_pg_dir'. The pending patches make it impossible to dereference the runtime service function pointer without first switching %cr3. It's true that we now have duplicated switching code in efi_call_virt() and efi_call_phys_{prolog,epilog}() but we are sacrificing code duplication for a little more clarity and the ease of writing the page table switching code in C instead of asm. Signed-off-by: Matt Fleming Reviewed-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1448658575-17029-5-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 25 +++++++++++++++++++++ arch/x86/platform/efi/efi_64.c | 24 ++++++++++----------- arch/x86/platform/efi/efi_stub_64.S | 43 ------------------------------------- 3 files changed, 36 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..347eeacb06a8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include #include +#include /* * We map the EFI regions needed for runtime services non-contiguously, @@ -64,6 +65,17 @@ extern u64 asmlinkage efi_call(void *fp, ...); #define efi_call_phys(f, args...) efi_call((f), args) +/* + * Scratch space used for switching the pagetable in the EFI stub + */ +struct efi_scratch { + u64 r15; + u64 prev_cr3; + pgd_t *efi_pgt; + bool use_pgd; + u64 phys_stack; +} __packed; + #define efi_call_virt(f, ...) 
\ ({ \ efi_status_t __s; \ @@ -71,7 +83,20 @@ extern u64 asmlinkage efi_call(void *fp, ...); efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ + \ + if (efi_scratch.use_pgd) { \ + efi_scratch.prev_cr3 = read_cr3(); \ + write_cr3((unsigned long)efi_scratch.efi_pgt); \ + __flush_tlb_all(); \ + } \ + \ __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ + \ + if (efi_scratch.use_pgd) { \ + write_cr3(efi_scratch.prev_cr3); \ + __flush_tlb_all(); \ + } \ + \ __kernel_fpu_end(); \ preempt_enable(); \ __s; \ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 102976dda8c4..b19cdac959b2 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,16 +47,7 @@ */ static u64 efi_va = EFI_VA_START; -/* - * Scratch space used for switching the pagetable in the EFI stub - */ -struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - bool use_pgd; - u64 phys_stack; -} __packed; +struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { @@ -83,8 +74,11 @@ pgd_t * __init efi_call_phys_prolog(void) int pgd; int n_pgds; - if (!efi_enabled(EFI_OLD_MEMMAP)) - return NULL; + if (!efi_enabled(EFI_OLD_MEMMAP)) { + save_pgd = (pgd_t *)read_cr3(); + write_cr3((unsigned long)efi_scratch.efi_pgt); + goto out; + } early_code_mapping_set_exec(1); @@ -96,6 +90,7 @@ pgd_t * __init efi_call_phys_prolog(void) vaddress = (unsigned long)__va(pgd * PGDIR_SIZE); set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); } +out: __flush_tlb_all(); return save_pgd; @@ -109,8 +104,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) int pgd_idx; int nr_pgds; - if (!save_pgd) + if (!efi_enabled(EFI_OLD_MEMMAP)) { + write_cr3((unsigned long)save_pgd); + __flush_tlb_all(); return; + } nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 86d0f9e08dd9..32020cb8bb08 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -38,41 +38,6 @@ mov %rsi, %cr0; \ mov (%rsp), %rsp - /* stolen from gcc */ - .macro FLUSH_TLB_ALL - movq %r15, efi_scratch(%rip) - movq %r14, efi_scratch+8(%rip) - movq %cr4, %r15 - movq %r15, %r14 - andb $0x7f, %r14b - movq %r14, %cr4 - movq %r15, %cr4 - movq efi_scratch+8(%rip), %r14 - movq efi_scratch(%rip), %r15 - .endm - - .macro SWITCH_PGT - cmpb $0, efi_scratch+24(%rip) - je 1f - movq %r15, efi_scratch(%rip) # r15 - # save previous CR3 - movq %cr3, %r15 - movq %r15, efi_scratch+8(%rip) # prev_cr3 - movq efi_scratch+16(%rip), %r15 # EFI pgt - movq %r15, %cr3 - 1: - .endm - - .macro RESTORE_PGT - cmpb $0, efi_scratch+24(%rip) - je 2f - movq efi_scratch+8(%rip), %r15 - movq %r15, %cr3 - movq efi_scratch(%rip), %r15 - FLUSH_TLB_ALL - 2: - .endm - ENTRY(efi_call) SAVE_XMM mov (%rsp), %rax @@ -83,16 +48,8 @@ ENTRY(efi_call) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx - SWITCH_PGT call *%rdi - RESTORE_PGT addq $48, %rsp RESTORE_XMM ret ENDPROC(efi_call) - - .data -ENTRY(efi_scratch) - .fill 3,8,0 - .byte 0 - .quad 0 -- cgit From 67a9108ed4313b85a9c53406d80dc1ae3f8c3e36 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 27 Nov 2015 21:09:34 +0000 Subject: x86/efi: Build our own page table structures With commit e1a58320a38d ("x86/mm: Warn on W^X mappings") all users booting on 64-bit UEFI machines see the following warning, ------------[ cut here ]------------ WARNING: CPU: 7 PID: 1 at arch/x86/mm/dump_pagetables.c:225 
note_page+0x5dc/0x780() x86/mm: Found insecure W+X mapping at address ffff88000005f000/0xffff88000005f000 ... x86/mm: Checked W+X mappings: FAILED, 165660 W+X pages found. ... This is caused by mapping EFI regions with RWX permissions. There isn't much we can do to restrict the permissions for these regions due to the way the firmware toolchains mix code and data, but we can at least isolate these mappings so that they do not appear in the regular kernel page tables. In commit d2f7cbe7b26a ("x86/efi: Runtime services virtual mapping") we started using 'trampoline_pgd' to map the EFI regions because there was an existing identity mapping there which we use during the SetVirtualAddressMap() call and for broken firmware that accesses those addresses. But 'trampoline_pgd' shares some PGD entries with 'swapper_pg_dir' and does not provide the isolation we require. Notably the virtual address for __START_KERNEL_map and MODULES_START are mapped by the same PGD entry so we need to be more careful when copying changes over in efi_sync_low_kernel_mappings(). This patch doesn't go the full mile, we still want to share some PGD entries with 'swapper_pg_dir'. Having completely separate page tables brings its own issues such as synchronising new mappings after memory hotplug and module loading. Sharing also keeps memory usage down. Signed-off-by: Matt Fleming Reviewed-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1448658575-17029-6-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 1 + arch/x86/platform/efi/efi.c | 39 ++++++----------- arch/x86/platform/efi/efi_32.c | 5 +++ arch/x86/platform/efi/efi_64.c | 97 +++++++++++++++++++++++++++++++++++------- 4 files changed, 102 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 347eeacb06a8..8fd9e637629a 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -136,6 +136,7 @@ extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); +extern int __init efi_alloc_page_tables(void); extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad285404ea7f..3c1f3cd7b2ba 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -869,7 +869,7 @@ static void __init kexec_enter_virtual_mode(void) * This function will switch the EFI runtime services to virtual mode. * Essentially, we look through the EFI memmap and map every region that * has the runtime attribute bit set in its memory descriptor into the - * ->trampoline_pgd page table using a top-down VA allocation scheme. + * efi_pgd page table. 
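The page-table switch hoisted into efi_call_virt() above reduces to the pattern sketched here. This is illustrative only (both helper names are made up); it uses read_cr3()/write_cr3()/__flush_tlb_all() and the efi_scratch.efi_pgt slot introduced above, which holds the physical address that CR3 expects.

    #include <asm/special_insns.h>
    #include <asm/tlbflush.h>
    #include <asm/efi.h>

    /* Switch to the EFI page table, returning the previous CR3 value. */
    static unsigned long example_efi_switch_pgd(void)
    {
            unsigned long prev_cr3 = read_cr3();

            write_cr3((unsigned long)efi_scratch.efi_pgt);
            __flush_tlb_all();
            return prev_cr3;
    }

    /* Switch back after the runtime call has returned. */
    static void example_efi_restore_pgd(unsigned long prev_cr3)
    {
            write_cr3(prev_cr3);
            __flush_tlb_all();
    }

In efi_call_virt() this happens with preemption disabled and only when efi_scratch.use_pgd is set.
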
* * The old method which used to update that memory descriptor with the * virtual address obtained from ioremap() is still supported when the @@ -879,8 +879,8 @@ static void __init kexec_enter_virtual_mode(void) * * The new method does a pagetable switch in a preemption-safe manner * so that we're in a different address space when calling a runtime - * function. For function arguments passing we do copy the PGDs of the - * kernel page table into ->trampoline_pgd prior to each call. + * function. For function arguments passing we do copy the PUDs of the + * kernel page table into efi_pgd prior to each call. * * Specially for kexec boot, efi runtime maps in previous kernel should * be passed in via setup_data. In that case runtime ranges will be mapped @@ -895,6 +895,12 @@ static void __init __efi_enter_virtual_mode(void) efi.systab = NULL; + if (efi_alloc_page_tables()) { + pr_err("Failed to allocate EFI page tables\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + efi_merge_regions(); new_memmap = efi_map_regions(&count, &pg_shift); if (!new_memmap) { @@ -954,28 +960,11 @@ static void __init __efi_enter_virtual_mode(void) efi_runtime_mkexec(); /* - * We mapped the descriptor array into the EFI pagetable above but we're - * not unmapping it here. Here's why: - * - * We're copying select PGDs from the kernel page table to the EFI page - * table and when we do so and make changes to those PGDs like unmapping - * stuff from them, those changes appear in the kernel page table and we - * go boom. - * - * From setup_real_mode(): - * - * ... - * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; - * - * In this particular case, our allocation is in PGD 0 of the EFI page - * table but we've copied that PGD from PGD[272] of the EFI page table: - * - * pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272 - * - * where the direct memory mapping in kernel space is. - * - * new_memmap's VA comes from that direct mapping and thus clearing it, - * it would get cleared in the kernel page table too. + * We mapped the descriptor array into the EFI pagetable above + * but we're not unmapping it here because if we're running in + * EFI mixed mode we need all of memory to be accessible when + * we pass parameters to the EFI runtime services in the + * thunking code. * * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift); */ diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index ed5b67338294..58d669bc8250 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -38,6 +38,11 @@ * say 0 - 3G. */ +int __init efi_alloc_page_tables(void) +{ + return 0; +} + void efi_sync_low_kernel_mappings(void) {} void __init efi_dump_pagetable(void) {} int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index b19cdac959b2..4897f518760f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * We allocate runtime services regions bottom-up, starting from -4G, i.e. @@ -121,22 +122,92 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) early_code_mapping_set_exec(0); } +static pgd_t *efi_pgd; + +/* + * We need our own copy of the higher levels of the page tables + * because we want to avoid inserting EFI region mappings (EFI_VA_END + * to EFI_VA_START) into the standard kernel page tables. 
Everything + * else can be shared, see efi_sync_low_kernel_mappings(). + */ +int __init efi_alloc_page_tables(void) +{ + pgd_t *pgd; + pud_t *pud; + gfp_t gfp_mask; + + if (efi_enabled(EFI_OLD_MEMMAP)) + return 0; + + gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO; + efi_pgd = (pgd_t *)__get_free_page(gfp_mask); + if (!efi_pgd) + return -ENOMEM; + + pgd = efi_pgd + pgd_index(EFI_VA_END); + + pud = pud_alloc_one(NULL, 0); + if (!pud) { + free_page((unsigned long)efi_pgd); + return -ENOMEM; + } + + pgd_populate(NULL, pgd, pud); + + return 0; +} + /* * Add low kernel mappings for passing arguments to EFI functions. */ void efi_sync_low_kernel_mappings(void) { - unsigned num_pgds; - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); + unsigned num_entries; + pgd_t *pgd_k, *pgd_efi; + pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) return; - num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET); + /* + * We can share all PGD entries apart from the one entry that + * covers the EFI runtime mapping space. + * + * Make sure the EFI runtime region mappings are guaranteed to + * only span a single PGD entry and that the entry also maps + * other important kernel regions. + */ + BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + (EFI_VA_END & PGDIR_MASK)); + + pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); + pgd_k = pgd_offset_k(PAGE_OFFSET); + + num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET); + memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); - memcpy(pgd + pgd_index(PAGE_OFFSET), - init_mm.pgd + pgd_index(PAGE_OFFSET), - sizeof(pgd_t) * num_pgds); + /* + * We share all the PUD entries apart from those that map the + * EFI regions. Copy around them. 
+ */ + BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); + BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); + + pgd_efi = efi_pgd + pgd_index(EFI_VA_END); + pud_efi = pud_offset(pgd_efi, 0); + + pgd_k = pgd_offset_k(EFI_VA_END); + pud_k = pud_offset(pgd_k, 0); + + num_entries = pud_index(EFI_VA_END); + memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); + + pud_efi = pud_offset(pgd_efi, EFI_VA_START); + pud_k = pud_offset(pgd_k, EFI_VA_START); + + num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); + memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); } int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) @@ -150,8 +221,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd; - pgd = __va(efi_scratch.efi_pgt); + efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); + pgd = efi_pgd; /* * It can happen that the physical address of new_memmap lands in memory @@ -216,16 +287,14 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - - kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages); + kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages); } static void __init __map_region(efi_memory_desc_t *md, u64 va) { - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); unsigned long flags = 0; unsigned long pfn; + pgd_t *pgd = efi_pgd; if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; @@ -334,9 +403,7 @@ void __init efi_runtime_mkexec(void) void __init efi_dump_pagetable(void) { #ifdef CONFIG_EFI_PGT_DUMP - pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - - ptdump_walk_pgd_level(NULL, pgd); + ptdump_walk_pgd_level(NULL, efi_pgd); #endif } -- cgit From 26d7f65fbd22168c33d2350f3e7e3021f5761256 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sun, 25 Oct 2015 10:26:35 +0000 Subject: x86/efi: Preface all print statements with efi* tag The pr_*() calls in the x86 EFI code may or may not include a subsystem tag, which makes it difficult to grep the kernel log for all relevant EFI messages and leads users to miss important information. Recently, a bug reporter provided all the EFI print messages from the kernel log when trying to diagnose an issue but missed the following statement because it wasn't prefixed with anything indicating it was related to EFI, pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); Cc: Borislav Petkov Reviewed-by: Josh Triplett Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi-bgrt.c | 3 +++ arch/x86/platform/efi/efi_64.c | 2 ++ arch/x86/platform/efi/quirks.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index ea48449b2e63..9a52b5c4438f 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -10,6 +10,9 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
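How the pr_fmt() definitions added in this patch take effect: pr_err() and friends expand to printk(KERN_ERR pr_fmt(fmt), ...), so defining pr_fmt() before the printk machinery is pulled in prefixes every message emitted from the file. A minimal standalone sketch; the function is hypothetical and the message is the one quoted in the changelog above.

    /* Must come before printk.h (in practice, before any kernel header). */
    #define pr_fmt(fmt) "efi: " fmt

    #include <linux/printk.h>

    static void example_report(unsigned long pa_memmap)
    {
            /* Prints: "efi: Error ident-mapping new memmap (0x...)!" */
            pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
    }
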
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a0ac0f9c307f..d347e854a5e4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -15,6 +15,8 @@ * */ +#define pr_fmt(fmt) "efi: " fmt + #include #include #include diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 1c7380da65ff..6452070f3025 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "efi: " fmt + #include #include #include @@ -256,7 +258,7 @@ void __init efi_apply_memmap_quirks(void) * services. */ if (!efi_runtime_supported()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + pr_info("Setup done, disabling due to 32/64-bit mismatch\n"); efi_unmap_memmap(); } -- cgit From 50a0cb565246f20d59cdb161778531e4b19d35ac Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 9 Dec 2015 15:41:08 -0800 Subject: x86/efi-bgrt: Fix kernel panic when mapping BGRT data Starting with this commit 35eb8b81edd4 ("x86/efi: Build our own page table structures") efi regions have a separate page directory called "efi_pgd". In order to access any efi region we have to first shift %cr3 to this page table. In the bgrt code we are trying to copy bgrt_header and image, but these regions fall under "EFI_BOOT_SERVICES_DATA" and to access these regions we have to shift %cr3 to efi_pgd and not doing so will cause page fault as shown below. [ 0.251599] Last level dTLB entries: 4KB 64, 2MB 0, 4MB 0, 1GB 4 [ 0.259126] Freeing SMP alternatives memory: 32K (ffffffff8230e000 - ffffffff82316000) [ 0.271803] BUG: unable to handle kernel paging request at fffffffefce35002 [ 0.279740] IP: [] efi_bgrt_init+0x144/0x1fd [ 0.286383] PGD 300f067 PUD 0 [ 0.289879] Oops: 0000 [#1] SMP [ 0.293566] Modules linked in: [ 0.297039] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-rc1-eywa-eywa-built-in-47041+ #2 [ 0.306619] Hardware name: Intel Corporation Skylake Client platform/Skylake Y LPDDR3 RVP3, BIOS SKLSE2R1.R00.B104.B01.1511110114 11/11/2015 [ 0.320925] task: ffffffff820134c0 ti: ffffffff82000000 task.ti: ffffffff82000000 [ 0.329420] RIP: 0010:[] [] efi_bgrt_init+0x144/0x1fd [ 0.338821] RSP: 0000:ffffffff82003f18 EFLAGS: 00010246 [ 0.344852] RAX: fffffffefce35000 RBX: fffffffefce35000 RCX: fffffffefce2b000 [ 0.352952] RDX: 000000008a82b000 RSI: ffffffff8235bb80 RDI: 000000008a835000 [ 0.361050] RBP: ffffffff82003f30 R08: 000000008a865000 R09: ffffffffff202850 [ 0.369149] R10: ffffffff811ad62f R11: 0000000000000000 R12: 0000000000000000 [ 0.377248] R13: ffff88016dbaea40 R14: ffffffff822622c0 R15: ffffffff82003fb0 [ 0.385348] FS: 0000000000000000(0000) GS:ffff88016d800000(0000) knlGS:0000000000000000 [ 0.394533] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.401054] CR2: fffffffefce35002 CR3: 000000000300c000 CR4: 00000000003406f0 [ 0.409153] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 0.417252] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 0.425350] Stack: [ 0.427638] ffffffffffffffff ffffffff82256900 ffff88016dbaea40 ffffffff82003f40 [ 0.436086] ffffffff821bbce0 ffffffff82003f88 ffffffff8219c0c2 0000000000000000 [ 0.444533] ffffffff8219ba4a ffffffff822622c0 0000000000083000 00000000ffffffff [ 0.452978] Call Trace: [ 0.455763] [] efi_late_init+0x9/0xb [ 0.461697] [] start_kernel+0x463/0x47f [ 0.467928] [] ? set_init_arg+0x55/0x55 [ 0.474159] [] ? 
early_idt_handler_array+0x120/0x120 [ 0.481669] [] x86_64_start_reservations+0x2a/0x2c [ 0.488982] [] x86_64_start_kernel+0x13d/0x14c [ 0.495897] Code: 00 41 b4 01 48 8b 78 28 e8 09 36 01 00 48 85 c0 48 89 c3 75 13 48 c7 c7 f8 ac d3 81 31 c0 e8 d7 3b fb fe e9 b5 00 00 00 45 84 e4 <44> 8b 6b 02 74 0d be 06 00 00 00 48 89 df e8 ae 34 0$ [ 0.518151] RIP [] efi_bgrt_init+0x144/0x1fd [ 0.524888] RSP [ 0.528851] CR2: fffffffefce35002 [ 0.532615] ---[ end trace 7b06521e6ebf2aea ]--- [ 0.537852] Kernel panic - not syncing: Attempted to kill the idle task! As said above one way to fix this bug is to shift %cr3 to efi_pgd but we are not doing that way because it leaks inner details of how we switch to EFI page tables into a new call site and it also adds duplicate code. Instead, we remove the call to efi_lookup_mapped_addr() and always perform early_mem*() instead of early_io*() because we want to remap RAM regions and not I/O regions. We also delete efi_lookup_mapped_addr() because we are no longer using it. Signed-off-by: Sai Praneeth Prakhya Reported-by: Wendy Wang Cc: Borislav Petkov Cc: Josh Triplett Cc: Ricardo Neri Cc: Ravi Shankar Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi-bgrt.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 9a52b5c4438f..bf51f4c02562 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -31,8 +31,7 @@ struct bmp_header { void __init efi_bgrt_init(void) { acpi_status status; - void __iomem *image; - bool ioremapped = false; + void *image; struct bmp_header bmp_header; if (acpi_disabled) @@ -73,20 +72,14 @@ void __init efi_bgrt_init(void) return; } - image = efi_lookup_mapped_addr(bgrt_tab->image_address); + image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header)); if (!image) { - image = early_ioremap(bgrt_tab->image_address, - sizeof(bmp_header)); - ioremapped = true; - if (!image) { - pr_err("Ignoring BGRT: failed to map image header memory\n"); - return; - } + pr_err("Ignoring BGRT: failed to map image header memory\n"); + return; } - memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); - if (ioremapped) - early_iounmap(image, sizeof(bmp_header)); + memcpy(&bmp_header, image, sizeof(bmp_header)); + early_memunmap(image, sizeof(bmp_header)); bgrt_image_size = bmp_header.size; bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); @@ -96,18 +89,14 @@ void __init efi_bgrt_init(void) return; } - if (ioremapped) { - image = early_ioremap(bgrt_tab->image_address, - bmp_header.size); - if (!image) { - pr_err("Ignoring BGRT: failed to map image memory\n"); - kfree(bgrt_image); - bgrt_image = NULL; - return; - } + image = early_memremap(bgrt_tab->image_address, bmp_header.size); + if (!image) { + pr_err("Ignoring BGRT: failed to map image memory\n"); + kfree(bgrt_image); + bgrt_image = NULL; + return; } - memcpy_fromio(bgrt_image, image, bgrt_image_size); - if (ioremapped) - early_iounmap(image, bmp_header.size); + memcpy(bgrt_image, image, bgrt_image_size); + early_memunmap(image, bmp_header.size); } -- cgit From e2c90dd7e11e3025b46719a79fb4bb1e7a5cef9f Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 21 Dec 2015 14:12:52 +0000 Subject: x86/efi-bgrt: Replace early_memremap() with memremap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Môshe reported the following warning triggered on his machine since 
commit 50a0cb565246 ("x86/efi-bgrt: Fix kernel panic when mapping BGRT data"), [ 0.026936] ------------[ cut here ]------------ [ 0.026941] WARNING: CPU: 0 PID: 0 at mm/early_ioremap.c:137 __early_ioremap+0x102/0x1bb() [ 0.026941] Modules linked in: [ 0.026944] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-rc1 #2 [ 0.026945] Hardware name: Dell Inc. XPS 13 9343/09K8G1, BIOS A05 07/14/2015 [ 0.026946] 0000000000000000 900f03d5a116524d ffffffff81c03e60 ffffffff813a3fff [ 0.026948] 0000000000000000 ffffffff81c03e98 ffffffff810a0852 00000000d7b76000 [ 0.026949] 0000000000000000 0000000000000001 0000000000000001 000000000000017c [ 0.026951] Call Trace: [ 0.026955] [] dump_stack+0x44/0x55 [ 0.026958] [] warn_slowpath_common+0x82/0xc0 [ 0.026959] [] warn_slowpath_null+0x1a/0x20 [ 0.026961] [] __early_ioremap+0x102/0x1bb [ 0.026962] [] early_memremap+0x13/0x15 [ 0.026964] [] efi_bgrt_init+0x162/0x1ad [ 0.026966] [] efi_late_init+0x9/0xb [ 0.026968] [] start_kernel+0x46f/0x49f [ 0.026970] [] ? early_idt_handler_array+0x120/0x120 [ 0.026972] [] x86_64_start_reservations+0x2a/0x2c [ 0.026974] [] x86_64_start_kernel+0x14a/0x16d [ 0.026977] ---[ end trace f9b3812eb8e24c58 ]--- [ 0.026978] efi_bgrt: Ignoring BGRT: failed to map image memory early_memremap() has an upper limit on the size of mapping it can handle which is ~200KB. Clearly the BGRT image on Môshe's machine is much larger than that. There's actually no reason to restrict ourselves to using the early_* version of memremap() - the ACPI BGRT driver is invoked late enough in boot that we can use the standard version, with the benefit that the late version allows mappings of arbitrary size. Reported-by: Môshe van der Sterre Tested-by: Môshe van der Sterre Signed-off-by: Matt Fleming Cc: Josh Triplett Cc: Sai Praneeth Prakhya Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1450707172-12561-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/platform/efi/efi-bgrt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index bf51f4c02562..b0970661870a 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -72,14 +72,14 @@ void __init efi_bgrt_init(void) return; } - image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header)); + image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB); if (!image) { pr_err("Ignoring BGRT: failed to map image header memory\n"); return; } memcpy(&bmp_header, image, sizeof(bmp_header)); - early_memunmap(image, sizeof(bmp_header)); + memunmap(image); bgrt_image_size = bmp_header.size; bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); @@ -89,7 +89,7 @@ void __init efi_bgrt_init(void) return; } - image = early_memremap(bgrt_tab->image_address, bmp_header.size); + image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); if (!image) { pr_err("Ignoring BGRT: failed to map image memory\n"); kfree(bgrt_image); @@ -98,5 +98,5 @@ void __init efi_bgrt_init(void) } memcpy(bgrt_image, image, bgrt_image_size); - early_memunmap(image, bmp_header.size); + memunmap(image); } -- cgit From 352b78c62f27b356b182008acd3117f3ee03ffd2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:21 -0800 Subject: x86/vdso: Track each mm's loaded vDSO image as well as its base As we start to do more intelligent things with the vDSO at runtime (as opposed to just at mm initialization time), 
we'll need to know which vDSO is in use. In principle, we could guess based on the mm type, but that's over-complicated and error-prone. Instead, just track it in the mmu context. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c99ac48681bad709ca7ad5ee899d9042a3af6b00.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 1 + arch/x86/include/asm/mmu.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f69e264ac4..80b021067bd6 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -121,6 +121,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) text_start = addr - image->sym_vvar_start; current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5e7160..1ea0baef1175 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,7 +19,8 @@ typedef struct { #endif struct mutex lock; - void __user *vdso; + void __user *vdso; /* vdso base address */ + const struct vdso_image *vdso_image; /* vdso image in use */ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; -- cgit From 05ef76b20fc4297b0d3f8a956f1c809a8a1b3f1d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:22 -0800 Subject: x86/vdso: Use .fault for the vDSO text mapping The old scheme for mapping the vDSO text is rather complicated. vdso2c generates a struct vm_special_mapping and a blank .pages array of the correct size for each vdso image. Init code in vdso/vma.c populates the .pages array for each vDSO image, and the mapping code selects the appropriate struct vm_special_mapping. With .fault, we can use a less roundabout approach: vdso_fault() just returns the appropriate page for the selected vDSO image. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f886954c186bafd74e1b967c8931d852ae199aa2.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso2c.h | 7 ------- arch/x86/entry/vdso/vma.c | 26 +++++++++++++++++++------- arch/x86/include/asm/vdso.h | 3 --- 3 files changed, 19 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..abe961c7c71c 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_offset)); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 80b021067bd6..eb50d7c1f161 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -27,13 +27,7 @@ unsigned int __read_mostly vdso64_enabled = 1; void __init init_vdso_image(const struct vdso_image *image) { - int i; - int npages = (image->size) / PAGE_SIZE; - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -90,6 +84,24 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) #endif } +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + + if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping text_mapping = { + .name = "[vdso]", + .fault = vdso_fault, +}; + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; @@ -131,7 +143,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); + &text_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index deabaf9759b6..43dc55be524e 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -13,9 +13,6 @@ struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - /* text_mapping.pages is big enough for data/size page pointers */ - struct vm_special_mapping text_mapping; - unsigned long alt, alt_len; long sym_vvar_start; /* Negative offset to the vvar area */ -- cgit From a48a7042613eb1524d18b7b1ed7d3a6b611fd21f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:23 -0800 Subject: x86/vdso: Use ->fault() instead of remap_pfn_range() for the vvar mapping This is IMO much less ugly, and it also opens the 
door to disallowing unprivileged userspace HPET access on systems with usable TSCs. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c19c2909e5ee3c3d8742f916586676bb7c40345f.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 97 ++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index eb50d7c1f161..4b5461ba4f6b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -102,18 +102,69 @@ static const struct vm_special_mapping text_mapping = { .fault = vdso_fault, }; +static int vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + long sym_offset; + int ret = -EFAULT; + + if (!image) + return VM_FAULT_SIGBUS; + + sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + + image->sym_vvar_start; + + /* + * Sanity check: a symbol offset of zero means that the page + * does not exist for this vdso image, not that the page is at + * offset zero relative to the text mapping. This should be + * impossible here, because sym_offset should only be zero for + * the page past the end of the vvar mapping. + */ + if (sym_offset == 0) + return VM_FAULT_SIGBUS; + + if (sym_offset == image->sym_vvar_page) { + ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_hpet_page) { +#ifdef CONFIG_HPET_TIMER + if (hpet_address) { + ret = vm_insert_pfn_prot( + vma, + (unsigned long)vmf->virtual_address, + hpet_address >> PAGE_SHIFT, + pgprot_noncached(PAGE_READONLY)); + } +#endif + } else if (sym_offset == image->sym_pvclock_page) { + struct pvclock_vsyscall_time_info *pvti = + pvclock_pvti_cpu0_va(); + if (pvti) { + ret = vm_insert_pfn( + vma, + (unsigned long)vmf->virtual_address, + __pa(pvti) >> PAGE_SHIFT); + } + } + + if (ret == 0 || ret == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; +} + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { + static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", - .pages = no_pages, + .fault = vvar_fault, }; - struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -153,7 +204,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) vma = _install_special_mapping(mm, addr, -image->sym_vvar_start, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, &vvar_mapping); if (IS_ERR(vma)) { @@ -161,41 +213,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) goto up_fail; } - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + 
image->sym_hpet_page, - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - pvti = pvclock_pvti_cpu0_va(); - if (pvti && image->sym_pvclock_page) { - ret = remap_pfn_range(vma, - text_start + image->sym_pvclock_page, - __pa(pvti) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - } - up_fail: if (ret) current->mm->context.vdso = NULL; -- cgit From bd902c536298830e4d126dcf6491b46d3f1bf96e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:24 -0800 Subject: x86/vdso: Disallow vvar access to vclock IO for never-used vclocks It makes me uncomfortable that even modern systems grant every process direct read access to the HPET. While fixing this for real without regressing anything is a mess (unmapping the HPET is tricky because we don't adequately track all the mappings), we can do almost as well by tracking which vclocks have ever been used and only allowing pages associated with used vclocks to be faulted in. This will cause rogue programs that try to peek at the HPET to get SIGBUS instead on most systems. We can't restrict faults to vclock pages that are associated with the currently selected vclock due to a race: a process could start to access the HPET for the first time and race against a switch away from the HPET as the current clocksource. We can't segfault the process trying to peek at the HPET in this case, even though the process isn't going to do anything useful with the data. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e79d06295625c02512277737ab55085a498ac5d8.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 4 ++-- arch/x86/entry/vsyscall/vsyscall_gtod.c | 9 ++++++++- arch/x86/include/asm/clocksource.h | 9 +++++---- arch/x86/include/asm/vgtod.h | 6 ++++++ 4 files changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4b5461ba4f6b..7c912fefe79b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -130,7 +130,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_hpet_page) { #ifdef CONFIG_HPET_TIMER - if (hpet_address) { + if (hpet_address && vclock_was_used(VCLOCK_HPET)) { ret = vm_insert_pfn_prot( vma, (unsigned long)vmf->virtual_address, @@ -141,7 +141,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); - if (pvti) { + if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, (unsigned long)vmf->virtual_address, diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e330416995..0fb3a104ac62 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -16,6 +16,8 @@ #include #include +int vclocks_used __read_mostly; + DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) @@ -26,12 +28,17 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct 
vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + /* Mark the new vclock used. */ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->vclock_mode = vclock_mode; vdata->cycle_last = tk->tkr_mono.cycle_last; vdata->mask = tk->tkr_mono.mask; vdata->mult = tk->tkr_mono.mult; diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index eda81dc0f4ae..d194266acb28 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,10 +3,11 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index f556c4843aa1..e728699db774 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -37,6 +37,12 @@ struct vsyscall_gtod_data { }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +extern int vclocks_used; +static inline bool vclock_was_used(int vclock) +{ + return READ_ONCE(vclocks_used) & (1 << vclock); +} + static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) { unsigned ret; -- cgit From 2024315124b43bb8b25a119ec6c614f0647dcc6d Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 18 Jan 2016 20:13:14 +0600 Subject: x86/asm/entry: Remove unused SAVE_ALL/RESTORE_ALL macros for !CONFIG_x86_64 SAVE_ALL and RESTORE_ALL macros for !CONFIG_X86_64 were introduced in commit: 1a338ac32 commit ('sched, x86: Optimize the preempt_schedule() call') ... and were used in the ___preempt_schedule() and ___preempt_schedule_context() functions from the arch/x86/kernel/preempt.S. But the arch/x86/kernel/preempt.S file was removed in the following commit: 0ad6e3c5 commit ('x86: Speed up ___preempt_schedule*() by using THUNK helpers') The ___preempt_schedule()/___preempt_schedule_context() functions were reimplemeted and do not use SAVE_ALL/RESTORE_ALL anymore. These macros have no users anymore, so we can remove them. Signed-off-by: Alexander Kuleshov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453126394-13717-1-git-send-email-kuleshovmail@gmail.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index e32206e09868..9a9e5884066c 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. 
These - * are different from the entry_32.S versions in not changing the segment - * registers. So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. The main use case is calling C code from assembler - * when all the registers need to be preserved. - */ - - .macro SAVE_ALL - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - .endm - - .macro RESTORE_ALL - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - .endm - #endif /* CONFIG_X86_64 */ /* -- cgit From a1ff5726081858a9ad98934eff7af6616c576875 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 Jan 2016 10:58:12 +0100 Subject: x86/cpufeature: Add AMD AVIC bit CPUID Fn8000_000A_EDX[13] denotes support for AMD's Virtual Interrupt controller, i.e., APIC virtualization. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Kaplan Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/1452938292-12327-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c9464297..bbf166e805be 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -268,6 +268,7 @@ #define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* * BUG word(s) -- cgit From 43c75f933be26422f166d6d869a19997312f4732 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 30 Nov 2015 10:47:43 -0600 Subject: x86/mm: Streamline and restore probe_memory_block_size() The cumulative effect of the following two commits: bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems") 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit") ... is some pretty convoluted code. The first commit also removed code for the UV case without stated reason, which might lead to unexpected change in behavior. This commit has no other (intended) functional change; just seeks to simplify and make the code more understandable, beyond restoring the UV behavior. The whole section with the "tail size" doesn't seem to be reachable, since both the >= 64GB and < 64GB case return, so it was removed. Signed-off-by: Seth Jennings Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1448902063-18885-1-git-send-email-sjennings@variantweb.net [ Rewrote the title and changelog. 
] Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482d69ec..8f18fec74e67 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "mm_internal.h" @@ -1193,26 +1194,13 @@ int kern_addr_valid(unsigned long addr) static unsigned long probe_memory_block_size(void) { - /* start from 2g */ - unsigned long bz = 1UL<<31; + unsigned long bz = MIN_MEMORY_BLOCK_SIZE; - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { - pr_info("Using 2GB memory block size for large-memory system\n"); - return 2UL * 1024 * 1024 * 1024; - } - - /* less than 64g installed */ - if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) - return MIN_MEMORY_BLOCK_SIZE; - - /* get the tail size */ - while (bz > MIN_MEMORY_BLOCK_SIZE) { - if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) - break; - bz >>= 1; - } + /* if system is UV or has 64GB of RAM or more, use large blocks */ + if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) + bz = 2UL << 30; /* 2GB */ - printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } -- cgit From 95d97adb2bb85d964bae4538e0574e742e522dda Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 17 Dec 2015 23:56:52 +0000 Subject: x86/signal: Cleanup get_nr_restart_syscall() Check for TS_COMPAT instead of TIF_IA32 to distinguish ia32 tasks from 64-bit tasks. Check for __X32_SYSCALL_BIT iff CONFIG_X86_X32_ABI is defined. Suggested-by: Andy Lutomirski Signed-off-by: Dmitry V. Levin Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elvira Khabirova Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160111145515.GB29007@altlinux.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cb6282c3638f..c07ff5ddbd47 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -692,12 +692,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) +#ifdef CONFIG_X86_64 + if (is_ia32_task()) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else return __NR_restart_syscall; -#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ - return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : - __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); -#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +#endif } /* -- cgit From 997963edd912a6d77d68b2bbc19f40ce8facabd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:18 -0600 Subject: x86/asm: Clean up frame pointer macros The asm macros for setting up and restoring the frame pointer aren't currently being used. However, they will be needed soon to help asm functions to comply with stacktool. Rename FRAME/ENDFRAME to FRAME_BEGIN/FRAME_END for more symmetry. Also make the code more readable and improve the comments. 
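A hypothetical illustration of where these macros end up being used (not code from this patch): with the C string variants added by the follow-up patch below, an inline-asm call site can open and close a standard frame so that frame-pointer based stack traces through it stay reliable. Basic asm is used so the single '%' in the macro expansion is passed through literally; a real call site with operands would need '%%'-style templates and a clobber list, and example_asm_helper is a made-up symbol.

    #include <asm/frame.h>

    static inline void example_framed_call(void)
    {
            asm volatile(FRAME_BEGIN
                         "call example_asm_helper\n\t"
                         FRAME_END);
    }
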
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f488a8e3bfc8ac7d4d3d350953e664e7182b044.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/frame.h | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 793179cf8e21..cec213bd6eba 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,23 +1,34 @@ +#ifndef _ASM_X86_FRAME_H +#define _ASM_X86_FRAME_H + #ifdef __ASSEMBLY__ #include -/* The annotation hides the frame from the unwinder and makes it look - like a ordinary ebp save/restore. This avoids some special cases for - frame pointer later */ +/* + * These are stack frame creation macros. They should be used by every + * callable non-leaf asm function to make kernel stack traces more reliable. + */ #ifdef CONFIG_FRAME_POINTER - .macro FRAME - __ASM_SIZE(push,) %__ASM_REG(bp) - __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) - .endm - .macro ENDFRAME - __ASM_SIZE(pop,) %__ASM_REG(bp) - .endm -#else - .macro FRAME - .endm - .macro ENDFRAME - .endm -#endif + +.macro FRAME_BEGIN + push %_ASM_BP + _ASM_MOV %_ASM_SP, %_ASM_BP +.endm + +.macro FRAME_END + pop %_ASM_BP +.endm + +#define FRAME_OFFSET __ASM_SEL(4, 8) + +#else /* !CONFIG_FRAME_POINTER */ + +#define FRAME_BEGIN +#define FRAME_END +#define FRAME_OFFSET 0 + +#endif /* CONFIG_FRAME_POINTER */ #endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_FRAME_H */ -- cgit From ec5186557abbe711dfd34e1863735dfecb0602cc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:19 -0600 Subject: x86/asm: Add C versions of frame pointer macros Add C versions of the frame pointer macros which can be used to create a stack frame in inline assembly. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6786a282bf232ede3e2866414eae3cf02c7d662.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/frame.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index cec213bd6eba..6e4d170726b7 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,16 +1,17 @@ #ifndef _ASM_X86_FRAME_H #define _ASM_X86_FRAME_H -#ifdef __ASSEMBLY__ - #include /* * These are stack frame creation macros. They should be used by every * callable non-leaf asm function to make kernel stack traces more reliable. 
*/ + #ifdef CONFIG_FRAME_POINTER +#ifdef __ASSEMBLY__ + .macro FRAME_BEGIN push %_ASM_BP _ASM_MOV %_ASM_SP, %_ASM_BP @@ -20,6 +21,16 @@ pop %_ASM_BP .endm +#else /* !__ASSEMBLY__ */ + +#define FRAME_BEGIN \ + "push %" _ASM_BP "\n" \ + _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n" + +#define FRAME_END "pop %" _ASM_BP "\n" + +#endif /* __ASSEMBLY__ */ + #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ @@ -30,5 +41,4 @@ #endif /* CONFIG_FRAME_POINTER */ -#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_FRAME_H */ -- cgit From 320d25b6a05f8b73c23fc21025d2906ecdd2d4fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:58 -0800 Subject: x86/mm/32: Set NX in __supported_pte_mask before enabling paging There's a short window in which very early mappings can end up with NX clear because they are created before we've noticed that we have NX. It turns out that we detect NX very early, so there's no need to defer __supported_pte_mask setup. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2b544627345f7110160545a3f47031eb45c3ad4f.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 6 ++++++ arch/x86/mm/setup_nx.c | 5 ++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae24b6d2..57fc3f8c85fd 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -389,6 +389,12 @@ default_entry: /* Make changes effective */ wrmsr + /* + * And make sure that all the mappings we set up have NX set from + * the beginning. + */ + orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) + enable_paging: /* diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eacb3321..78f5d5907f98 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,9 +31,8 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else + /* If disable_nx is set, clear NX on all new mappings going forward. */ + if (disable_nx) __supported_pte_mask &= ~_PAGE_NX; } -- cgit From 7c360572b430a0e9757bafc0c20f26c920f2a07f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:59 -0800 Subject: x86/mm: Make kmap_prot into a #define The value (once we initialize it) is a foregone conclusion. Make it a #define to save a tiny amount of text and data size and to make it more comprehensible. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0850eb0213de9da88544ff7fae72dc6d06d2b441.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 2 +- arch/x86/mm/init_32.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6d7d0e52ed5a..8554f960e21b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; +#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..a4bb1c7ab65e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -388,7 +388,6 @@ repeat: } pte_t *kmap_pte; -pgprot_t kmap_prot; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { @@ -405,8 +404,6 @@ static void __init kmap_init(void) */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } #ifdef CONFIG_HIGHMEM -- cgit From 753b11ef8e92a1c1bbe97f2a5ec14bdd1ef2e6fe Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 21 Jan 2016 14:11:59 +0000 Subject: x86/efi: Setup separate EFI page tables in kexec paths The switch to using a new dedicated page table for EFI runtime calls in commit commit 67a9108ed431 ("x86/efi: Build our own page table structures") failed to take into account changes required for the kexec code paths, which are unfortunately duplicated in the EFI code. Call the allocation and setup functions in kexec_enter_virtual_mode() just like we do for __efi_enter_virtual_mode() to avoid hitting NULL-pointer dereferences when making EFI runtime calls. At the very least, the call to efi_setup_page_tables() should have existed for kexec before the following commit: 67a9108ed431 ("x86/efi: Build our own page table structures") Things just magically worked because we were actually using the kernel's page tables that contained the required mappings. Reported-by: Srikar Dronamraju Tested-by: Srikar Dronamraju Signed-off-by: Matt Fleming Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453385519-11477-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3c1f3cd7b2ba..bdd9477f937c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -815,6 +815,7 @@ static void __init kexec_enter_virtual_mode(void) { #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; + unsigned int num_pages; void *p; efi.systab = NULL; @@ -829,6 +830,12 @@ static void __init kexec_enter_virtual_mode(void) return; } + if (efi_alloc_page_tables()) { + pr_err("Failed to allocate EFI page tables\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + /* * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. 
@@ -843,6 +850,14 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); + num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages >>= PAGE_SHIFT; + + if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } + efi_sync_low_kernel_mappings(); /* -- cgit From 14365449b6ce34cf6a3040ff8ebbb39d89d67159 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 18:21:21 +0600 Subject: x86/asm: Remove unused L3_PAGE_OFFSET L3_PAGE_OFFSET was introduced in commit a6523748bd (paravirt/x86, 64-bit: move __PAGE_OFFSET to leave a space for hypervisor), but has no users. Signed-off-by: Alexander Kuleshov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Andrey Ryabinin Link: http://lkml.kernel.org/r/1453810881-30622-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head_64.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..2e974680f5ad 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,6 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) -L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) -- cgit From 10cff58c6772587f9df422dc1dfaca936c66be1f Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Mon, 25 Jan 2016 16:46:09 -0800 Subject: crypto: sha1-mb - Add missing args_digest offset The _args_digest is defined as _args+_digest, both of which are the first members of 2 separate structures, effectively yielding _args_digest to have a value of zero. Thus, no errors have spawned yet due to this. To ensure sanity, adding the missing _args_digest offset to the sha1_mb_mgr_submit.S. Signed-off-by: Megha Dey Acked-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S index 2ab9560b53c8..c420d89b175f 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S @@ -197,7 +197,7 @@ len_is_0: vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0 - movl 4*32(state, idx, 4), DWORD_tmp + movl _args_digest+4*32(state, idx, 4), DWORD_tmp vmovdqu %xmm0, _result_digest(job_rax) movl DWORD_tmp, _result_digest+1*16(job_rax) -- cgit From bd922477d9350a3006d73dabb241400e6c4181b0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:29 +0200 Subject: locking/x86: Add cc clobber for ADDL ADDL clobbers flags (such as CF) but barrier.h didn't tell this to GCC. Historically, GCC doesn't need one on x86, and always considers flags clobbered. We are probably missing the cc clobber in a *lot* of places for this reason. But even if not necessary, it's probably a good thing to add for documentation, and in case GCC semantcs ever change. Reported-by: Borislav Petkov Signed-off-by: Michael S. 
Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-2-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a584e1c50918..a65bdb10246a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -15,9 +15,12 @@ * Some non-Intel clones support out of order store. wmb() ceases to be a * nop for these. */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") #define rmb() asm volatile("lfence":::"memory") -- cgit From e37cee133c72c9529f74a20d9b7eb3b6dfb928b5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:37 +0200 Subject: locking/x86: Drop a comment left over from X86_OOSTORE The comment about wmb being non-NOP to deal with non-Intel CPUs is a left over from before the following commit: 09df7c4c8097 ("x86: Remove CONFIG_X86_OOSTORE") It makes no sense now: in particular, wmb() is not a NOP even for regular Intel CPUs because of weird use-cases e.g. dealing with WC memory. Drop this comment. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-3-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a65bdb10246a..a29174599a98 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -11,10 +11,6 @@ */ #ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ #define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ -- cgit From 57d9b1b43433a6ba7267c80b87d8e8f6e86edceb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:44 +0200 Subject: locking/x86: Tweak the comment about use of wmb() for IO On x86, we *do* still use the non-NOP rmb()/wmb() for IO barriers, but even that is generally questionable. 
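(A minimal sketch of the kind of I/O case meant here, using an entirely hypothetical device: a descriptor is filled in coherent memory and then an MMIO doorbell is rung. Only the placement of wmb() is the point; none of the names below come from a real driver.)

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/io.h>
#include <asm/barrier.h>

#define EXAMPLE_DOORBELL	0x40	/* hypothetical register offset */

struct example_desc {
	__le64	addr;
	__le32	len;
	__le32	flags;
};

static void example_post_buffer(void __iomem *regs, struct example_desc *d,
				dma_addr_t buf, u32 len)
{
	d->addr  = cpu_to_le64(buf);
	d->len   = cpu_to_le32(len);
	d->flags = cpu_to_le32(1);	/* hypothetical "ready" bit */

	/*
	 * Order the descriptor stores before the doorbell write so the
	 * device never observes a half-written descriptor.
	 */
	wmb();

	writel(1, regs + EXAMPLE_DOORBELL);
}
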
Leave them around as historial unless somebody can point to a case where they care about the performance, but tweak the comment so people don't think they are strictly required in all cases. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-4-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a29174599a98..bfb28caf97b1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -6,7 +6,7 @@ /* * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking + * And yes, this might be required on UP too when we're talking * to devices. */ -- cgit From ca59809ff6d572ae58fc6bedf7500f5a60fdbd64 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:51 +0200 Subject: locking/x86: Use mb() around clflush() The following commit: f8e617f4582995f ("sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs") adds memory barriers around clflush(), but this seems wrong for UP since barrier() has no effect on clflush(). We really want MFENCE, so switch to mb() instead. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-5-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..9decee2bfdbe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -418,9 +418,9 @@ static void mwait_idle(void) if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - smp_mb(); /* quirk */ + mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); - smp_mb(); /* quirk */ + mb(); /* quirk */ } __monitor((void *)¤t_thread_info()->flags, 0, 0); -- cgit From fba324744bfd2a7948a7710d7a021d76dafb9b67 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:21 -0800 Subject: x86/syscalls: Refactor syscalltbl.sh This splits out the code to emit a syscall line. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1bfcbba991f5cfaa9291ff950a593daa972a205f.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscalltbl.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec071e7..167965ee742e 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,13 +3,21 @@ in="$1" out="$2" +emit() { + abi="$1" + nr="$2" + entry="$3" + compat="$4" + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi +} + grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" - fi + emit "$abi" "$nr" "$entry" "$compat" done ) > "$out" -- cgit From 32324ce15ea8cb4c8acc28acb2fd36fabf73e9db Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:22 -0800 Subject: x86/syscalls: Remove __SYSCALL_COMMON and __SYSCALL_X32 The common/64/x32 distinction has no effect other than determining which kernels actually support the syscall. Move the logic into syscalltbl.sh. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/58d4a95f40e43b894f93288b4a3633963d0ee22e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_64.c | 8 -------- arch/x86/entry/syscalls/syscalltbl.sh | 17 ++++++++++++++++- arch/x86/kernel/asm-offsets_64.c | 6 ------ arch/x86/um/sys_call_table_64.c | 3 --- arch/x86/um/user-offsets.c | 2 -- 5 files changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 41283d22be7a..974fd89ac806 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,14 +6,6 @@ #include #include -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 167965ee742e..5ebeaf1041e7 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -18,6 +18,21 @@ emit() { grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" + if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then + # COMMON is the same as 64, except that we don't expect X32 + # programs to use it. Our expectation has nothing to do with + # any generated code, so treat them the same. + emit 64 "$nr" "$entry" "$compat" + elif [ "$abi" == "X32" ]; then + # X32 is equivalent to 64 on an X32-compatible kernel. 
+ echo "#ifdef CONFIG_X86_X32_ABI" + emit 64 "$nr" "$entry" "$compat" + echo "#endif" + elif [ "$abi" == "I386" ]; then + emit "$abi" "$nr" "$entry" "$compat" + else + echo "Unknown abi $abi" >&2 + exit 1 + fi done ) > "$out" diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f2edafb5f24e..29db3b3f550c 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,12 +5,6 @@ #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif static char syscalls_64[] = { #include }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index b74ea6c2c0e7..71a497cde921 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,9 +35,6 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ce7e3607a870..5edf4f4bbf53 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -15,8 +15,6 @@ static char syscalls[] = { }; #else #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ static char syscalls[] = { #include }; -- cgit From 3e65654e3db6df6aba9c5b895f8b8e6a8d8eb508 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:23 -0800 Subject: x86/syscalls: Move compat syscall entry handling into syscalltbl.sh Rather than duplicating the compat entry handling in all consumers of syscalls_BITS.h, handle it directly in syscalltbl.sh. Now we generate entries in syscalls_32.h like: __SYSCALL_I386(5, sys_open) __SYSCALL_I386(5, compat_sys_open) and all of its consumers implicitly get the right entry point. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b7c2b501dc0e6e43050e916b95807c3e2e16e9bb.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 10 ++-------- arch/x86/entry/syscall_64.c | 4 ++-- arch/x86/entry/syscalls/syscalltbl.sh | 22 ++++++++++++++++++---- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 4 ++-- arch/x86/um/sys_call_table_32.c | 4 ++-- arch/x86/um/sys_call_table_64.c | 4 ++-- arch/x86/um/user-offsets.c | 4 ++-- 8 files changed, 31 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 9a6649857106..3e2829759da2 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,17 +6,11 @@ #include #include -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), +#define __SYSCALL_I386(nr, sym) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 974fd89ac806..3781989b180e 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, +#define __SYSCALL_64(nr, sym) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 5ebeaf1041e7..b81479c8c5fb 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -8,10 +8,24 @@ emit() { nr="$2" entry="$3" compat="$4" - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" + + if [ "$abi" == "64" -a -n "$compat" ]; then + echo "a compat entry for a 64-bit syscall makes no sense" >&2 + exit 1 + fi + + if [ -z "$compat" ]; then + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + else + echo "#ifdef CONFIG_X86_32" + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + echo "#else" + echo "__SYSCALL_${abi}($nr, $compat)" + echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6ce39025f467..abec4c9f1c97 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; diff 
--git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 29db3b3f550c..9677bf9a616f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 439c0994b696..d4669a679fd0 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 71a497cde921..6ee5268beb05 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_64(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5edf4f4bbf53..6c9a9c1eae32 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls[] = { #include }; -- cgit From cfcbadb49dabb05efa23e1a0f95f3391c0a815bc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:24 -0800 Subject: x86/syscalls: Add syscall entry qualifiers This will let us specify something like 'sys_xyz/foo' instead of 'sys_xyz' in the syscall table, where the 'foo' qualifier conveys some extra information to the C code. The intent is to allow things like sys_execve/ptregs to indicate that sys_execve() touches pt_regs. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2de06e33dce62556b3ec662006fcb295504e296e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 4 ++-- arch/x86/entry/syscall_64.c | 4 ++-- arch/x86/entry/syscalls/syscalltbl.sh | 19 ++++++++++++++++--- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 4 ++-- arch/x86/um/sys_call_table_32.c | 4 ++-- arch/x86/um/sys_call_table_64.c | 4 ++-- arch/x86/um/user-offsets.c | 4 ++-- 8 files changed, 29 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3e2829759da2..8f895ee13a1c 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [nr] = sym, +#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 3781989b180e..a1d408772ae6 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index b81479c8c5fb..cd3d3015d7df 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,6 +3,19 @@ in="$1" out="$2" +syscall_macro() { + abi="$1" + nr="$2" + entry="$3" + + # Entry can be either just a function name or "function/qualifier" + real_entry="${entry%%/*}" + qualifier="${entry:${#real_entry}}" # Strip the function name + qualifier="${qualifier:1}" # Strip the slash, if any + + echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" +} + emit() { abi="$1" nr="$2" @@ -16,15 +29,15 @@ emit() { if [ -z "$compat" ]; then if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi else echo "#ifdef CONFIG_X86_32" if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi echo "#else" - echo "__SYSCALL_${abi}($nr, $compat)" + syscall_macro "$abi" "$nr" "$compat" echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index abec4c9f1c97..fdeb0ce07c16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char 
syscalls[] = { #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9677bf9a616f..d875f97d4e0b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index d4669a679fd0..bfce503dffae 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 6ee5268beb05..f306413d3eb6 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 6c9a9c1eae32..470564bbd08e 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; -- cgit From 302f5b260c322696cbeb962a263a4d2d99864aed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:25 -0800 Subject: x86/entry/64: Always run ptregs-using syscalls on the slow path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64-bit syscalls currently have an optimization in which they are called with partial pt_regs. A small handful require full pt_regs. In the 32-bit and compat cases, I cleaned this up by forcing full pt_regs for all syscalls. The performance hit doesn't really matter as the affected system calls are fundamentally heavy and this is the 32-bit compat case. I want to clean up the 64-bit case as well, but I don't want to hurt fast path performance. To do that, I want to force the syscalls that use pt_regs onto the slow path. 
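(To make the qualifier plumbing concrete before the diff: for a 64-bit table line that ends up as "sys_execve/ptregs", syscalltbl.sh emits "__SYSCALL_64(59, sys_execve, ptregs)", and the C consumer below keys off that third argument roughly as in this sketch. The macro bodies are the ones added by this patch; the surrounding comments are editorial.)

/* arch/x86/entry/syscall_64.c: route qualified slots to a stub. */
#define __SYSCALL_64_QUAL_(sym)		sym
#define __SYSCALL_64_QUAL_ptregs(sym)	ptregs_##sym

#define __SYSCALL_64(nr, sym, qual)	[nr] = __SYSCALL_64_QUAL_##qual(sym),
/* => slot 59 becomes [59] = ptregs_sys_execve, while unqualified
 * entries stay [nr] = sys_xyz. */

On the asm side, the same generated header is included with __SYSCALL_64_QUAL_ptregs() defined to instantiate a ptregs_stub, so only the /ptregs entries grow a stub, and that stub is what forces the call onto the slow path with full pt_regs, as the entry_64.S hunk below shows.
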
This will enable us to make slow path syscalls be real ABI-compliant C functions. Use the new syscall entry qualification machinery for this. 'stub_clone' is now 'stub_clone/ptregs'. The next patch will eliminate the stubs, and we'll just have 'sys_clone/ptregs'. As of this patch, two-phase entry tracing is no longer used. It has served its purpose (namely a huge speedup on some workloads prior to more general opportunistic SYSRET support), and once the dust settles I'll send patches to back it out. The implementation is heavily based on a patch from Brian Gerst: http://lkml.kernel.org/g/1449666173-15366-1-git-send-email-brgerst@gmail.com Originally-From: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Linux Kernel Mailing List Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b9beda88460bcefec6e7d792bd44eca9b760b0c4.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 56 +++++++++++++++++++++++++--------- arch/x86/entry/syscall_64.c | 7 +++-- arch/x86/entry/syscalls/syscall_64.tbl | 16 +++++----- 3 files changed, 55 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9d34d3cfceb6..f1c8f150728e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -182,7 +182,15 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx + + /* + * This call instruction is handled specially in stub_ptregs_64. + * It might end up jumping to the slow path. If it jumps, RAX is + * clobbered. + */ call *sys_call_table(, %rax, 8) +.Lentry_SYSCALL_64_after_fastpath_call: + movq %rax, RAX(%rsp) 1: /* @@ -235,25 +243,13 @@ GLOBAL(int_ret_from_sys_call_irqs_off) /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ - -tracesys_phase2: SAVE_EXTRA_REGS movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax, %rdx - call syscall_trace_enter_phase2 + call syscall_trace_enter /* * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned + * We don't reload %rax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX @@ -355,6 +351,38 @@ opportunistic_sysret_failed: jmp restore_c_regs_and_iret END(entry_SYSCALL_64) +ENTRY(stub_ptregs_64) + /* + * Syscalls marked as needing ptregs land here. + * If we are on the fast path, we need to save the extra regs. + * If we are on the slow path, the extra regs are already saved. + * + * RAX stores a pointer to the C function implementing the syscall. 
+ */ + cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) + jne 1f + + /* Called from fast path -- pop return address and jump to slow path */ + popq %rax + jmp tracesys /* called from fast path */ + +1: + /* Called from C */ + jmp *%rax /* called from C */ +END(stub_ptregs_64) + +.macro ptregs_stub func +ENTRY(ptregs_\func) + leaq \func(%rip), %rax + jmp stub_ptregs_64 +END(ptregs_\func) +.endm + +/* Instantiate ptregs_stub for each ptregs-using syscall */ +#define __SYSCALL_64_QUAL_(sym) +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +#include .macro FORK_LIKE func ENTRY(stub_\func) diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index a1d408772ae6..9dbc5abb6162 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,14 @@ #include #include -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64_QUAL_(sym) sym +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym + +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a50bdc..5de342a729d0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn +15 64 rt_sigreturn stub_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone -57 common fork stub_fork -58 common vfork stub_vfork -59 64 execve stub_execve +56 common clone stub_clone/ptregs +57 common fork stub_fork/ptregs +58 common vfork stub_vfork/ptregs +59 64 execve stub_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat +322 64 execveat stub_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -346,7 +346,7 @@ 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve +520 x32 execve stub_x32_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat +545 x32 execveat stub_x32_execveat/ptregs -- cgit From 46eabf06c04a6847a694a0c1413d4ac57e5b058a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:26 -0800 Subject: x86/entry/64: Call all 
native slow-path syscalls with full pt-regs This removes all of the remaining asm syscall stubs except for stub_ptregs_64. Entries in the main syscall table are now all callable from C. The resulting asm is every bit as ridiculous as it looks. The next few patches will clean it up. This patch is here to let reviewers rest their brains and for bisection. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a6b3801be0d505d50aefabda02d3b93efbfc9c73.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 79 +--------------------------------- arch/x86/entry/syscalls/syscall_64.tbl | 18 ++++---- 2 files changed, 10 insertions(+), 87 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f1c8f150728e..f7050a5d9dbc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -253,7 +253,6 @@ tracesys: * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -264,6 +263,7 @@ tracesys: movq %r10, %rcx /* fixup for C */ call *sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) + RESTORE_EXTRA_REGS 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -384,83 +384,6 @@ END(ptregs_\func) #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) #include - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) - .align 8 -GLOBAL(stub_x32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. - */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - /* * A newly forked process directly context switches into this address. 
* diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5de342a729d0..dcf107ce2cd4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn/ptregs +15 64 rt_sigreturn sys_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone/ptregs -57 common fork stub_fork/ptregs -58 common vfork stub_vfork/ptregs -59 64 execve stub_execve/ptregs +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat/ptregs +322 64 execveat sys_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -339,14 +339,14 @@ # for native 64-bit operation. # 512 x32 rt_sigaction compat_sys_rt_sigaction -513 x32 rt_sigreturn stub_x32_rt_sigreturn +513 x32 rt_sigreturn sys32_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl 515 x32 readv compat_sys_readv 516 x32 writev compat_sys_writev 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve/ptregs +520 x32 execve compat_sys_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat/ptregs +545 x32 execveat compat_sys_execveat/ptregs -- cgit From 24d978b76ffd20ecff8a8d1c21b16fe740f8b119 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:27 -0800 Subject: x86/entry/64: Stop using int_ret_from_sys_call in ret_from_fork ret_from_fork is now open-coded and is no longer tangled up with the syscall code. This isn't so bad -- this adds very little code, and IMO the result is much easier to understand. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0747e2a5e47084655a1e96351c545b755c41fa7.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7050a5d9dbc..cb5d940a7abd 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -390,7 +390,6 @@ END(ptregs_\func) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) pushq $0x0002 @@ -398,28 +397,32 @@ ENTRY(ret_from_fork) call schedule_tail /* rdi: 'prev' task parameter */ - RESTORE_EXTRA_REGS - testb $3, CS(%rsp) /* from kernel_thread? 
*/ + jnz 1f /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the 32-bit compat paths. - * Use IRET code path to return, since it can safely handle - * all of the above. + * We came from kernel_thread. This code path is quite twisted, and + * someone should clean it up. + * + * copy_thread_tls stashes the function pointer in RBX and the + * parameter to be passed in RBP. The called function is permitted + * to call do_execve and thereby jump to user mode. */ - jnz int_ret_from_sys_call + movq RBP(%rsp), %rdi + call *RBX(%rsp) + movl $0, RAX(%rsp) /* - * We came from kernel_thread - * nb: we depend on RESTORE_EXTRA_REGS above + * Fall through as though we're exiting a syscall. This makes a + * twisted sort of sense if we just called do_execve. */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + +1: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS + jmp restore_regs_and_iret END(ret_from_fork) /* -- cgit From 1e423bff959e48166f5b7efca01fdb0dbdf05846 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:28 -0800 Subject: x86/entry/64: Migrate the 64-bit syscall slow path to C This is more complicated than the 32-bit and compat cases because it preserves an asm fast path for the case where the callee-saved regs aren't needed in pt_regs and no entry or exit work needs to be done. This appears to slow down fastpath syscalls by no more than one cycle on my Skylake laptop. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ce2335a4d42dc164b24132ee5e8c7716061f947b.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 26 +++++++++++ arch/x86/entry/entry_64.S | 117 ++++++++++++++++------------------------------ 2 files changed, 65 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03663740c866..75175f92f462 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) prepare_exit_to_usermode(regs); } +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned long nr = regs->orig_ax; + + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { + regs->ax = sys_call_table[nr & __SYSCALL_MASK]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } + + syscall_return_slowpath(regs); +} +#endif + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. 
Called with IRQs on and does all entry and diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cb5d940a7abd..567aa522ac0a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + TRACE_IRQS_OFF + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ @@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. + */ + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz entry_SYSCALL64_slow_path + entry_SYSCALL_64_fastpath: + /* + * Easy case: enable interrupts and issue the syscall. If the syscall + * needs pt_regs, we'll call a stub that disables interrupts again + * and jumps to the slow path. + */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -193,88 +199,43 @@ entry_SYSCALL_64_fastpath: movq %rax, RAX(%rsp) 1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. + * If we get here, then we know that pt_regs is clean for SYSRET64. + * If we see that no exit work is required (which we are required + * to check with IRQs off), then we can go straight to SYSRET64. */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + jnz 1f - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + RESTORE_C_REGS movq RSP(%rsp), %rsp - /* - * 64-bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) 
We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ USERGS_SYSRET64 -GLOBAL(int_ret_from_sys_call_irqs_off) +1: + /* + * The fast path looked good when we started, but something changed + * along the way and we need to switch to the slow path. Calling + * raise(3) will trigger this, for example. IRQs are off. + */ TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - jmp int_ret_from_sys_call - - /* Do syscall entry tracing */ -tracesys: SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_trace_enter - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax -#else - andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10, %rcx /* fixup for C */ - call *sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) - RESTORE_EXTRA_REGS -1: - /* Use IRET because user could have changed pt_regs->foo */ + call syscall_return_slowpath /* returns with IRQs disabled */ + jmp return_from_SYSCALL_64 -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) +entry_SYSCALL64_slow_path: + /* IRQs are off. */ SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ + call do_syscall_64 /* returns with IRQs disabled */ + +return_from_SYSCALL_64: RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -364,7 +325,7 @@ ENTRY(stub_ptregs_64) /* Called from fast path -- pop return address and jump to slow path */ popq %rax - jmp tracesys /* called from fast path */ + jmp entry_SYSCALL64_slow_path /* called from fast path */ 1: /* Called from C */ -- cgit From 5e9ebbd87a99ecc6abb74325b0ac63c46891f6f3 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Sat, 30 Jan 2016 14:01:12 +0600 Subject: x86/boot: Micro-optimize reset_early_page_tables() Save 25 bytes of code and make the bootup a tiny bit faster: text data bss dec filename 9735144 4970776 15474688 30180608 vmlinux.old 9735119 4970776 15474688 30180583 vmlinux Signed-off-by: Alexander Kuleshov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454140872-16926-1-git-send-email-kuleshovmail@gmail.com [ Fixed various small details. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f129a9af6357..35843caa4ea7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - unsigned long i; - - for (i = 0; i < PTRS_PER_PGD-1; i++) - early_level4_pgt[i].pgd = 0; - + memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); } @@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void) int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; - unsigned long i; pgdval_t pgd, *pgd_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; @@ -81,8 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PUD; i++) - pud_p[i] = 0; + memset(pud_p, 0, sizeof(pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -97,8 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_p[i] = 0; + memset(pmd_p, 0, sizeof(pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; -- cgit From f33b14a4b96b185634848046f54fb0d5028566a9 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:20 +0100 Subject: x86/e820: Set System RAM type and descriptor Change e820_reserve_resources() to set 'flags' and 'desc' from e820 types. Set E820_RESERVED_KERN and E820_RAM's (System RAM) io resource type to IORESOURCE_SYSTEM_RAM. Do the same for "Kernel data", "Kernel code", and "Kernel bss", which are child nodes of System RAM. I/O resource descriptor is set to 'desc' for entries that are (and will be) target ranges of walk_iomem_res() and region_intersects(). Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Joerg Roedel Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Mark Salter Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Toshi Kani Cc: WANG Chao Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 38 +++++++++++++++++++++++++++++++++++++- arch/x86/kernel/setup.c | 6 +++--- 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4f96fe..837365f10912 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -925,6 +925,41 @@ static const char *e820_type_to_string(int e820_type) } } +static unsigned long e820_type_to_iomem_type(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: + return IORESOURCE_SYSTEM_RAM; + case E820_ACPI: + case E820_NVS: + case E820_UNUSABLE: + case E820_PRAM: + case E820_PMEM: + default: + return IORESOURCE_MEM; + } +} + +static unsigned long e820_type_to_iores_desc(int e820_type) +{ + switch (e820_type) { + case E820_ACPI: + return IORES_DESC_ACPI_TABLES; + case E820_NVS: + return IORES_DESC_ACPI_NV_STORAGE; + case E820_PMEM: + return IORES_DESC_PERSISTENT_MEMORY; + case E820_PRAM: + return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_RESERVED_KERN: + case E820_RAM: + case E820_UNUSABLE: + default: + return IORES_DESC_NONE; + } +} + static bool do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ @@ -967,7 +1002,8 @@ void __init e820_reserve_resources(void) res->start = e820.map[i].addr; res->end = end; - res->flags = IORESOURCE_MEM; + res->flags = e820_type_to_iomem_type(e820.map[i].type); + res->desc = e820_type_to_iores_desc(e820.map[i].type); /* * don't register the region that could be conflicted with diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3d80e6d42a2..aa52c1009475 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -152,21 +152,21 @@ static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; -- cgit From f0f4711aa16b82016c0b6e59871934bbd71258da Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:30 +0100 Subject: x86, kexec, nvdimm: Use walk_iomem_res_desc() for iomem search Change the callers of walk_iomem_res() scanning for the following resources by name to use walk_iomem_res_desc() instead. "ACPI Tables" "ACPI Non-volatile Storage" "Persistent Memory (legacy)" "Crash kernel" Note, the caller of walk_iomem_res() with "GART" will be removed in a later patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chun-Yi Cc: Dan Williams Cc: Denys Vlasenko Cc: Don Zickus Cc: H. Peter Anvin Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra (Intel) Cc: Ross Zwisler Cc: Stephen Rothwell Cc: Takao Indoh Cc: Thomas Gleixner Cc: Toshi Kani Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: linux-nvdimm@lists.01.org Link: http://lkml.kernel.org/r/1453841853-11383-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 4 ++-- arch/x86/kernel/pmem.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 58f34319b29a..35e152eeb6e0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -599,12 +599,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add ACPI tables */ cmd.type = E820_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; - walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_NVS; - walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); /* Add crashk_low_res region */ diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 14415aff1813..92f70147a9a6 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data) static __init int register_e820_pmem(void) { - char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; int rc; - rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); if (rc <= 0) return 0; -- cgit From f296f2634920d205b93d878b48d87bb7e0a4c256 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:31 +0100 Subject: x86/kexec: Remove walk_iomem_res() call with GART type There is no longer any driver inserting a "GART" region in the kernel since 707d4eefbdb3 ("Revert "[PATCH] Insert GART region into resource map""). Remove the call to walk_iomem_res() with "GART" type, its callback function, and GART-specific variables set by the callback. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chun-Yi Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Takao Indoh Cc: Thomas Gleixner Cc: Toshi Kani Cc: Viresh Kumar Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 35e152eeb6e0..9ef978d69c22 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -57,10 +57,9 @@ struct crash_elf_data { struct kimage *image; /* * Total number of ram ranges we have after various adjustments for - * GART, crash reserved region etc. + * crash reserved region, etc. 
*/ unsigned int max_nr_ranges; - unsigned long gart_start, gart_end; /* Pointer to elf header */ void *ehdr; @@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) return 0; } -static int get_gart_ranges_callback(u64 start, u64 end, void *arg) -{ - struct crash_elf_data *ced = arg; - - ced->gart_start = start; - ced->gart_end = end; - - /* Not expecting more than 1 gart aperture */ - return 1; -} - /* Gather all the required information to prepare elf headers for ram regions */ static void fill_up_crash_elf_data(struct crash_elf_data *ced, @@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced, ced->max_nr_ranges = nr_ranges; - /* - * We don't create ELF headers for GART aperture as an attempt - * to dump this memory in second kernel leads to hang/crash. - * If gart aperture is present, one needs to exclude that region - * and that could lead to need of extra phdr. - */ - walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, - ced, get_gart_ranges_callback); - - /* - * If we have gart region, excluding that could potentially split - * a memory range, resulting in extra header. Account for that. - */ - if (ced->gart_end) - ced->max_nr_ranges++; - /* Exclusion of crash region could split memory ranges */ ced->max_nr_ranges++; @@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } - /* Exclude GART region */ - if (ced->gart_end) { - ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); - if (ret) - return ret; - } - return ret; } -- cgit From cd4d09ec6f6c12a2cc3db5b7d8876a325a53545b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:04 +0100 Subject: x86/cpufeature: Carve out X86_FEATURE_* Move them to a separate header and have the following dependency: x86/cpufeatures.h <- x86/processor.h <- x86/cpufeature.h This makes it easier to use the header in asm code and not include the whole cpufeature.h and add guards for asm. Suggested-by: H. 
Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/boot/cpuflags.h | 2 +- arch/x86/boot/mkcpustr.c | 2 +- arch/x86/crypto/crc32-pclmul_glue.c | 2 +- arch/x86/crypto/crc32c-intel_glue.c | 2 +- arch/x86/crypto/crct10dif-pclmul_glue.c | 2 +- arch/x86/entry/common.c | 1 + arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/vdso/vdso32-setup.c | 1 - arch/x86/entry/vdso/vdso32/system_call.S | 2 +- arch/x86/entry/vdso/vma.c | 1 + arch/x86/include/asm/alternative.h | 6 - arch/x86/include/asm/apic.h | 1 - arch/x86/include/asm/arch_hweight.h | 2 + arch/x86/include/asm/cmpxchg.h | 1 + arch/x86/include/asm/cpufeature.h | 284 +----------------------------- arch/x86/include/asm/cpufeatures.h | 288 +++++++++++++++++++++++++++++++ arch/x86/include/asm/fpu/internal.h | 1 + arch/x86/include/asm/irq_work.h | 2 +- arch/x86/include/asm/mwait.h | 2 + arch/x86/include/asm/processor.h | 3 +- arch/x86/include/asm/smap.h | 2 +- arch/x86/include/asm/smp.h | 1 - arch/x86/include/asm/thread_info.h | 2 +- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/include/asm/uaccess_64.h | 2 +- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/cyrix.c | 1 + arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/match.c | 2 +- arch/x86/kernel/cpu/mkcapflags.sh | 6 +- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/transmeta.c | 2 +- arch/x86/kernel/e820.c | 1 + arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/msr.c | 2 +- arch/x86/kernel/verify_cpu.S | 2 +- arch/x86/lib/clear_page_64.S | 2 +- arch/x86/lib/copy_page_64.S | 2 +- arch/x86/lib/copy_user_64.S | 2 +- arch/x86/lib/memcpy_64.S | 2 +- arch/x86/lib/memmove_64.S | 2 +- arch/x86/lib/memset_64.S | 2 +- arch/x86/mm/setup_nx.c | 1 + arch/x86/oprofile/op_model_amd.c | 1 - arch/x86/um/asm/barrier.h | 2 +- 48 files changed, 334 insertions(+), 326 deletions(-) create mode 100644 arch/x86/include/asm/cpufeatures.h (limited to 'arch/x86') diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index ea97697e51e4..4cb404fd45ce 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -1,7 +1,7 @@ #ifndef BOOT_CPUFLAGS_H #define BOOT_CPUFLAGS_H -#include +#include #include struct cpu_features { diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 637097e66a62..f72498dc90d2 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -17,7 +17,7 @@ #include "../include/asm/required-features.h" #include "../include/asm/disabled-features.h" -#include "../include/asm/cpufeature.h" +#include "../include/asm/cpufeatures.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 07d2c6c86a54..27226df3f7d8 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0e9871693f24..0857b1a1de3b 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git 
a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index a3fcfc97a311..cd4df9322501 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 75175f92f462..c6ab2ebb5f4f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 77d8c5112900..4c5228352744 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 08a317a9ae4b..7853b53959cd 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 3a1d9297074b..0109ac6cb79c 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,7 +3,7 @@ */ #include -#include +#include #include /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7c912fefe79b..429d54d01b38 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7bfc85bbb8ff..99afb665a004 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -151,12 +151,6 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ ".popsection" -/* - * This must be included *after* the definition of ALTERNATIVE due to - * - */ -#include - /* * Alternative instructions for different CPU types or capabilities. 
* diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c80f6b6f3da2..0899cfc8dfe8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1ef709..02e799fa43d1 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H +#include + #ifdef CONFIG_64BIT /* popcnt %edi, %eax -- redundant REX prefix for alignment */ #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index ad19841eddfe..9733361fed6f 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -2,6 +2,7 @@ #define ASM_X86_CMPXCHG_H #include +#include #include /* Provides LOCK_PREFIX */ /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bbf166e805be..3cce9f3c5cb1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -1,289 +1,7 @@ -/* - * Defines x86 CPU feature bits - */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - -#define NCAPINTS 16 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ - -/* - * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. - */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" 
Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ - -/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ -/* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ - -/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ - -/* Other features, Linux-defined mapping, word 3 */ -/* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* 
TSC doesn't stop in S3 state */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - -/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ 
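[ Editor's note: every X86_FEATURE_* value in this header encodes (word * 32 + bit), where 'word' selects one of the NCAPINTS 32-bit capability words and 'bit' the flag inside it; cpu_has()-style tests essentially index a u32 array with those two values. The stand-alone sketch below is not kernel code; it only shows the decomposition, reusing two word-6 values copied from the listing above.

#include <stdio.h>

#define X86_FEATURE_OSVW	(6*32 + 9)	/* OS Visible Workaround */
#define X86_FEATURE_MWAITX	(6*32 + 29)	/* MONITORX/MWAITX */

static int has_feature(const unsigned int *caps, int feature)
{
	return (caps[feature / 32] >> (feature % 32)) & 1;
}

int main(void)
{
	unsigned int caps[16] = { 0 };	/* NCAPINTS == 16 words */

	caps[X86_FEATURE_MWAITX / 32] |= 1u << (X86_FEATURE_MWAITX % 32);

	printf("MWAITX: word %d, bit %d, set=%d\n", X86_FEATURE_MWAITX / 32,
	       X86_FEATURE_MWAITX % 32, has_feature(caps, X86_FEATURE_MWAITX));
	printf("OSVW:   word %d, bit %d, set=%d\n", X86_FEATURE_OSVW / 32,
	       X86_FEATURE_OSVW % 32, has_feature(caps, X86_FEATURE_OSVW));
	return 0;
}

Because the values are pure preprocessor constants, this patch can move the whole list verbatim into cpufeatures.h, where both C and assembler code can include it. ]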
-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ - -/* - * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7. - * - * Reuse free bits when adding new feature flags! - */ - -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ - -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ - -/* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* 
CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ - -/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ - -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ - -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ - -/* - * BUG word(s) - */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) - -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h new file mode 100644 index 000000000000..0ceb6adc8a48 --- /dev/null +++ b/arch/x86/include/asm/cpufeatures.h @@ -0,0 +1,288 @@ +#ifndef _ASM_X86_CPUFEATURES_H +#define _ASM_X86_CPUFEATURES_H + +#ifndef _ASM_X86_REQUIRED_FEATURES_H +#include +#endif + +#ifndef _ASM_X86_DISABLED_FEATURES_H +#include +#endif + +/* + * Defines x86 CPU feature bits + */ +#define NCAPINTS 16 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. 
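[ Editor's note: the naming rule stated just above is what the mkcapflags.sh change later in this patch relies on when it generates capflags.c from cpufeatures.h. The stand-alone sketch below is not the kernel's generator; it models that rule under the assumption that the default /proc/cpuinfo name is the lowercased macro suffix, with a leading quoted string in the comment overriding it and an empty quoted string hiding the flag.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static void flag_name(const char *macro, const char *comment, char *out)
{
	while (*comment == ' ')
		comment++;

	if (*comment == '"') {			/* quoted override; "" hides the flag */
		const char *end = strchr(comment + 1, '"');
		size_t len = end ? (size_t)(end - comment - 1) : 0;

		memcpy(out, comment + 1, len);
		out[len] = '\0';
		return;
	}

	macro += strlen("X86_FEATURE_");	/* default: lowercased macro suffix */
	for (; *macro; macro++, out++)
		*out = tolower((unsigned char)*macro);
	*out = '\0';
}

int main(void)
{
	char buf[32];

	flag_name("X86_FEATURE_XMM", "\"sse\"", buf);
	printf("X86_FEATURE_XMM    -> \"%s\"\n", buf);	/* "sse" */

	flag_name("X86_FEATURE_MWAITX", "MWAIT extension", buf);
	printf("X86_FEATURE_MWAITX -> \"%s\"\n", buf);	/* "mwaitx" */

	return 0;
}

That is how X86_FEATURE_XMM shows up as "sse" in /proc/cpuinfo while most flags simply reuse their macro name in lower case. ]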
+ */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 
operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! + */ + +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + +/* Virtualization flags: Linux defined, word 8 */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ + +/* Extended state features, CPUID level 
0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ + +/* + * BUG word(s) + */ +#define X86_BUG(x) (NCAPINTS*32 + (x)) + +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + +#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440df63f1..d01199def781 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * High level FPU state handling functions: diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 78162f8e248b..d0afb05c84fc 100644 --- 
a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -1,7 +1,7 @@ #ifndef _ASM_IRQ_WORK_H #define _ASM_IRQ_WORK_H -#include +#include static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index c70689b5e5aa..0deeb2d26df7 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -3,6 +3,8 @@ #include +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2d5a50cb61a2..491a3d9dbb15 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -13,7 +13,7 @@ struct vm86; #include #include #include -#include +#include #include #include #include @@ -24,7 +24,6 @@ struct vm86; #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ba665ebd17bb..db333300bd4b 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -15,7 +15,7 @@ #include #include -#include +#include /* "Raw" instruction opcodes */ #define __ASM_CLAC .byte 0x0f,0x01,0xca diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index dfcf0727623b..20a3de5cb3b0 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,6 @@ #endif #include #include -#include extern int smp_num_siblings; extern unsigned int num_processors; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..c0778fcab06d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ */ #ifndef __ASSEMBLY__ struct task_struct; -#include +#include #include struct thread_info { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029405a3..0bb31cb8c73b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b89c34c4019b..307698688fa1 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 58031303e304..faa7b5204129 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -64,7 +64,7 @@ ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ -cpufeature = $(src)/../../include/asm/cpufeature.h +cpufeature = $(src)/../../include/asm/cpufeatures.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6e483c..6608c03c2126 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e79637..15e47c1cd412 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648bc1a0a..9299e3bdfad6 100644 --- 
a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c52388cf4..341449c49f34 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index afa9f0d487ea..fbb5e90557a5 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 3f20710a5b23..6988c74409a8 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # IN=$1 @@ -49,8 +49,8 @@ dump_array() trap 'rm "$OUT"' EXIT ( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include " + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " echo "#endif" echo "" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149ee91c..74f1d90f9c29 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7aceca6..a19a663282b5 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include "cpu.h" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4f96fe..b3c2a697820a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * The e820 map is the map that gets modified e.g. with command line parameters diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae24b6d2..af1112980dd4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b8e6ff5cd5d0..be0ebbb6d1d1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 64f9616f93f1..7f3550acde1b 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include static struct class *msr_class; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 07efb35ee4bc..014ea59aa153 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -30,7 +30,7 @@ * appropriately. Either display a message or halt. 
*/ -#include +#include #include verify_cpu: diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a2fe51b00cce..65be7cfaf947 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,5 @@ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 009f98216b7e..24ef1c2104d4 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,7 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 982ce34f4a9b..fba343062055 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bba87de..a0de849435ad 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include -#include +#include #include /* diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ca2afdd6d98e..90ce01bee00c 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,7 @@ * - Copyright 2011 Fenghua Yu */ #include -#include +#include #include #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2661fad05827..c9c81227ea37 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include +#include #include .weak memset diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eacb3321..f65a33f505b6 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -4,6 +4,7 @@ #include #include +#include static int disable_nx; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 50d86c0e9ba4..660a83c8287b 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 174781a404ff..00c319048d52 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include -- cgit From bc696ca05f5a8927329ec276a892341e006b00ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:05 +0100 Subject: x86/cpufeature: Replace the old static_cpu_has() with safe variant So the old one didn't work properly before alternatives had run. And it was supposed to provide an optimized JMP because the assumption was that the offset it is jumping to is within a signed byte and thus a two-byte JMP. So I did an x86_64 allyesconfig build and dumped all possible sites where static_cpu_has() was used. The optimization amounted to all in all 12(!) places where static_cpu_has() had generated a 2-byte JMP. Which has saved us a whopping 36 bytes! This clearly is not worth the trouble so we can remove it. The only place where the optimization might count - in __switch_to() - we will handle differently. But that's not subject of this patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 10 ---- arch/x86/include/asm/cpufeature.h | 100 +++-------------------------------- arch/x86/include/asm/fpu/internal.h | 14 ++--- arch/x86/kernel/apic/apic_numachip.c | 4 +- arch/x86/kernel/cpu/common.c | 12 +---- arch/x86/kernel/vm86_32.c | 2 +- 6 files changed, 19 insertions(+), 123 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9b18ed97a8a2..68a2d1f0a683 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -350,16 +350,6 @@ config DEBUG_IMR_SELFTEST If unsure say N here. -config X86_DEBUG_STATIC_CPU_HAS - bool "Debug alternatives" - depends on DEBUG_KERNEL - ---help--- - This option causes additional code to be generated which - fails if static_cpu_has() is used before alternatives have - run. - - If unsure, say N. - config X86_DEBUG_FPU bool "Debug the x86 FPU code" depends on DEBUG_KERNEL diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3cce9f3c5cb1..a261cf2e7907 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,103 +125,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) /* - * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! */ #if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern void warn_pre_alternatives(void); -extern bool __static_cpu_has_safe(u16 bit); +extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically * patch the target code for additional performance. */ -static __always_inline __pure bool __static_cpu_has(u16 bit) -{ -#ifdef CC_HAVE_ASM_GOTO - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - - /* - * Catch too early usage of this before alternatives - * have run. 
- */ - asm_volatile_goto("1: jmp %l[t_warn]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* 1: do replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (X86_FEATURE_ALWAYS) : : t_warn); - -#endif - - asm_volatile_goto("1: jmp %l[t_no]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (bit) : : t_no); - return true; - t_no: - return false; - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - t_warn: - warn_pre_alternatives(); - return false; -#endif - -#else /* CC_HAVE_ASM_GOTO */ - - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $0,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 3f - .\n" - " .word %P1\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $1,%0\n" - "4:\n" - ".previous\n" - : "=qm" (flag) : "i" (bit)); - return flag; - -#endif /* CC_HAVE_ASM_GOTO */ -} - -#define static_cpu_has(bit) \ -( \ - __builtin_constant_p(boot_cpu_has(bit)) ? \ - boot_cpu_has(bit) : \ - __builtin_constant_p(bit) ? \ - __static_cpu_has(bit) : \ - boot_cpu_has(bit) \ -) - -static __always_inline __pure bool _static_cpu_has_safe(u16 bit) +static __always_inline __pure bool _static_cpu_has(u16 bit) { #ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" @@ -255,7 +171,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) t_no: return false; t_dynamic: - return __static_cpu_has_safe(bit); + return __static_cpu_has(bit); #else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ @@ -293,22 +209,21 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has_safe(bit) : flag); + return (flag == 2 ? __static_cpu_has(bit) : flag); #endif /* CC_HAVE_ASM_GOTO */ } -#define static_cpu_has_safe(bit) \ +#define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - _static_cpu_has_safe(bit) \ + _static_cpu_has(bit) \ ) #else /* * gcc 3.x is too stupid to do the static test; fall back to dynamic. 
*/ #define static_cpu_has(bit) boot_cpu_has(bit) -#define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) @@ -316,7 +231,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d01199def781..c2e46eb96b6d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -59,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void); */ static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); + return static_cpu_has(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVE); + return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has_safe(X86_FEATURE_FXSR); + return static_cpu_has(X86_FEATURE_FXSR); } /* @@ -301,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -323,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -461,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. 
*/ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c80c02c6ec49..ab5c2c685a3c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) unsigned long value; unsigned int id = (x >> 24) & 0xff; - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de8f60a..ee499817f3f5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,19 +1475,11 @@ void cpu_init(void) } #endif -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS -void warn_pre_alternatives(void) -{ - WARN(1, "You're using static_cpu_has before alternatives have run!\n"); -} -EXPORT_SYMBOL_GPL(warn_pre_alternatives); -#endif - -inline bool __static_cpu_has_safe(u16 bit) +inline bool __static_cpu_has(u16 bit) { return boot_cpu_has(bit); } -EXPORT_SYMBOL_GPL(__static_cpu_has_safe); +EXPORT_SYMBOL_GPL(__static_cpu_has); static void bsp_resume(void) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e574b8546518..3dce1ca0a653 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has_safe(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); -- cgit From a362bf9f5e7dd659b96d01382da7b855f4e5a7a1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 27 Jan 2016 09:43:25 +0100 Subject: x86/cpufeature: Get rid of the non-asm goto variant I can simply quote hpa from the mail: "Get rid of the non-asm goto variant and just fall back to dynamic if asm goto is unavailable. It doesn't make any sense, really, if it is supposed to be safe, and by now the asm goto-capable gcc is in more wide use. (Originally the gcc 3.x fallback to pure dynamic didn't exist, either.)" Booy, am I lazy. Cleanup the whole CC_HAVE_ASM_GOTO ifdeffery too, while at it. Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160127084325.GB30712@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 49 ++++----------------------------------- 1 file changed, 5 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a261cf2e7907..9048c1bbc519 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -129,17 +129,16 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * fast paths and boot_cpu_has() otherwise! 
*/ -#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). - * These are only valid after alternatives have run, but will statically - * patch the target code for additional performance. + * These will statically patch the target code for additional + * performance. */ static __always_inline __pure bool _static_cpu_has(u16 bit) { -#ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " @@ -172,45 +171,6 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) return false; t_dynamic: return __static_cpu_has(bit); -#else - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $2,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ - " .word %P2\n" /* always replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $0,%0\n" - "4:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 5f - .\n" /* repl offset */ - " .word %P1\n" /* feature bit */ - " .byte 4b - 3b\n" /* src len */ - " .byte 6f - 5f\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "5: movb $1,%0\n" - "6:\n" - ".previous\n" - : "=qm" (flag) - : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has(bit) : flag); -#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -221,7 +181,8 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) ) #else /* - * gcc 3.x is too stupid to do the static test; fall back to dynamic. + * Fall back to dynamic for gcc versions which don't support asm goto. Should be + * a minority now anyway. */ #define static_cpu_has(bit) boot_cpu_has(bit) #endif -- cgit From 337e4cc84021212a87b04b77b65cccc49304909e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:07 +0100 Subject: x86/alternatives: Add an auxilary section Add .altinstr_aux for additional instructions which will be used before and/or during patching. All stuff which needs more sophisticated patching should go there. See next patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..92dc211c11db 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -195,6 +195,17 @@ SECTIONS :init #endif + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. 
+ * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { -- cgit From 2476f2fa20568bd5d9e09cd35bcd73e99a6f4cc6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 27 Jan 2016 09:45:25 +0100 Subject: x86/alternatives: Discard dynamic check after init Move the code to do the dynamic check to the altinstr_aux section so that it is discarded after alternatives have run and a static branch has been chosen. This way we're changing the dynamic branch from C code to assembly, which makes it *substantially* smaller while avoiding a completely unnecessary call to an out of line function. Signed-off-by: Brian Gerst [ Changed it to do TESTB, as hpa suggested. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kristen Carlson Accardi Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Ross Zwisler Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452972124-7380-1-git-send-email-brgerst@gmail.com Link: http://lkml.kernel.org/r/20160127084525.GC30712@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 19 ++++++++++++------- arch/x86/kernel/cpu/common.c | 6 ------ 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9048c1bbc519..9fba7a5dd24a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -130,8 +130,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; */ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern bool __static_cpu_has(u16 bit); - /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These will statically patch the target code for additional @@ -139,7 +137,7 @@ extern bool __static_cpu_has(u16 bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp %l[t_dynamic]\n" + asm_volatile_goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -164,13 +162,20 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) " .byte 0\n" /* repl len */ " .byte 0\n" /* pad len */ ".previous\n" - : : "i" (bit), "i" (X86_FEATURE_ALWAYS) - : : t_dynamic, t_no); + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : "i" (bit), "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); + t_yes: return true; t_no: return false; - t_dynamic: - return __static_cpu_has(bit); } #define static_cpu_has(bit) \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ee499817f3f5..079d83fc6488 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,12 +1475,6 @@ void cpu_init(void) } #endif -inline bool __static_cpu_has(u16 bit) -{ - return boot_cpu_has(bit); -} -EXPORT_SYMBOL_GPL(__static_cpu_has); - static void bsp_resume(void) { if (this_cpu->c_bsp_resume) -- cgit From 8c725306993198f845038dc9e45a1267099867a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:09 +0100 Subject: x86/vdso: Use static_cpu_has() ... 
and simplify and speed up a tad. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 429d54d01b38..10f704584922 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -285,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg) #ifdef CONFIG_NUMA node = cpu_to_node(cpu); #endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + if (static_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); /* -- cgit From a4733143085d6c782ac1e6c85778655b6bac1d4e Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 22:12:10 +0100 Subject: x86/boot: Simplify kernel load address alignment check We are using %rax as temporary register to check the kernel address alignment. We don't really have to since the TEST instruction does not clobber the destination operand. Suggested-by: Brian Gerst Signed-off-by: Alexander Kuleshov Signed-off-by: Borislav Petkov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453531828-19291-1-git-send-email-kuleshovmail@gmail.com Link: http://lkml.kernel.org/r/1453842730-28463-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..7c21029cb733 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -76,9 +76,7 @@ startup_64: subq $_text - __START_KERNEL_map, %rbp /* Is the address not 2M aligned? */ - movq %rbp, %rax - andl $~PMD_PAGE_MASK, %eax - testl %eax, %eax + testl $~PMD_PAGE_MASK, %ebp jnz bad_address /* -- cgit From b7765086b7c5a5be029a739c2caa161da51c2076 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:26 -0800 Subject: x86/entry/64: Fix an IRQ state error on ptregs-using syscalls I messed up the IRQ state when jumping off the fast path due to invocation of a ptregs-using syscall. This bug shouldn't have had any impact yet, but it would have caused problems with subsequent context tracking cleanups. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/ab92cd365fb7b0a56869e920017790d96610fdca.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 567aa522ac0a..9f7bb808035e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,8 +191,8 @@ entry_SYSCALL_64_fastpath: /* * This call instruction is handled specially in stub_ptregs_64. - * It might end up jumping to the slow path. 
If it jumps, RAX is - * clobbered. + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. */ call *sys_call_table(, %rax, 8) .Lentry_SYSCALL_64_after_fastpath_call: @@ -315,17 +315,24 @@ END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) /* * Syscalls marked as needing ptregs land here. - * If we are on the fast path, we need to save the extra regs. - * If we are on the slow path, the extra regs are already saved. + * If we are on the fast path, we need to save the extra regs, + * which we achieve by trying again on the slow path. If we are on + * the slow path, the extra regs are already saved. * * RAX stores a pointer to the C function implementing the syscall. + * IRQs are on. */ cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) jne 1f - /* Called from fast path -- pop return address and jump to slow path */ + /* + * Called from fast path -- disable IRQs again, pop return address + * and jump to slow path + */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF popq %rax - jmp entry_SYSCALL64_slow_path /* called from fast path */ + jmp entry_SYSCALL64_slow_path 1: /* Called from C */ -- cgit From eb2a54c3271cb6443ae93ec44a91687b60c559a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:27 -0800 Subject: x86/entry/64: Fix fast-path syscall return register state I was fishing RIP (i.e. RCX) out of pt_regs->cx and RFLAGS (i.e. R11) out of pt_regs->r11. While it usually worked (pt_regs started out with CX == IP and R11 == FLAGS), it was very fragile. In particular, it broke sys_iopl() because sys_iopl() forgot to mark itself as using ptregs. Undo that part of the syscall rework. There was no compelling reason to do it this way. While I'm at it, load RCX and R11 before the other regs to be a little friendlier to the CPU, as they will be the first of the reloaded registers to be used. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/a85f8360c397e48186a9bc3e565ad74307a7b011.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f7bb808035e..70eadb0ea5fa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,7 +212,9 @@ entry_SYSCALL_64_fastpath: LOCKDEP_SYS_EXIT TRACE_IRQS_ON /* user mode is traced as IRQs on */ - RESTORE_C_REGS + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp USERGS_SYSRET64 -- cgit From bb56968a37a44070de92d5690c4b08dd98a5d3f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:28 -0800 Subject: x86/syscalls/64: Mark sys_iopl() as using ptregs sys_iopl() both reads and writes pt_regs->flags. Mark it as using ptregs. This isn't strictly necessary, as pt_regs->flags is available even in the fast path, but this is very lightweight now that we have syscall qualifiers and it could avoid some pain down the road. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3de0ca692fa8bf414c5e3d7afe3e6195d1a10e1f.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dcf107ce2cd4..2e5b565adacc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl sys_iopl +172 common iopl sys_iopl/ptregs 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module -- cgit From d99e1bd175f4291ddb6e62b22bb5bdbe3976389a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 25 Jan 2016 20:41:46 +0100 Subject: x86/entry/traps: Refactor preemption and interrupt flag handling Make the preemption and interrupt flag handling more readable by removing preempt_conditional_sti() and preempt_conditional_cli() helpers and using preempt_disable() and preempt_enable_no_resched() instead. Rename contitional_sti() and conditional_cli() to the more understandable cond_local_irq_enable() and cond_local_irq_disable() respectively, while at it. Suggested-by: Borislav Petkov Signed-off-by: Alexander Kuleshov [ Boris: massage text. ] Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tony Luck Cc: Wang Nan Link: http://lkml.kernel.org/r/1453750913-4781-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..410e8e2700c5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static inline void conditional_sti(struct pt_regs *regs) +static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_count_inc(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - preempt_count_dec(); } void ist_enter(struct pt_regs *regs) @@ -286,7 +272,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - conditional_sti(regs); + cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -368,7 +354,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) 
die("bounds", regs, error_code); @@ -443,7 +429,7 @@ do_general_protection(struct pt_regs *regs, long error_code) struct task_struct *tsk; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - conditional_sti(regs); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); @@ -517,9 +503,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. */ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -648,12 +636,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -673,7 +663,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: @@ -696,7 +687,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) { if (!fixup_exception(regs)) { @@ -743,7 +734,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - conditional_sti(regs); + cond_local_irq_enable(regs); } dotraplinkage void @@ -756,7 +747,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; - conditional_sti(regs); + cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); @@ -765,7 +756,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 - conditional_sti(regs); + cond_local_irq_enable(regs); #endif } NOKPROBE_SYMBOL(do_device_not_available); -- cgit From 16aaa53756501914a863ae7a15fcb070dc27c3d7 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Mon, 25 Jan 2016 20:41:47 +0100 Subject: x86/cpufeature: Use enum cpuid_leafs instead of magic numbers Most of the magic numbers in x86_capability[] have been converted to 'enum cpuid_leafs', and this patch updates the remaining part. Signed-off-by: Huaitong Han Signed-off-by: Borislav Petkov Cc: Alexander Kuleshov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Vrabel Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Hector Marco-Gisbert Cc: Jiang Liu Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Cc: Tony Luck Cc: Viresh Kumar Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1453750913-4781-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/elf.h | 2 +- arch/x86/kernel/mpparse.c | 2 +- arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1514753fd435..15340e36ddcb 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -256,7 +256,7 @@ extern int force_personality32; instruction set this CPU supports. This could be done in user space, but it's not easy, and we've already done it here. */ -#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) +#define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX]) /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 30ca7607cbbb..97340f2c437c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; processor.reserved[0] = 0; processor.reserved[1] = 0; for (i = 0; i < 2; i++) { diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229ac3f4f..a9033ae13369 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1535,7 +1535,7 @@ __init void lguest_init(void) */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[0] = cpuid_edx(1); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); /* Math is always hard! */ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d09e4c9d7cc5..2c261082eadf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1654,7 +1654,7 @@ asmlinkage __visible void __init xen_start_kernel(void) cpu_detect(&new_cpu_data); set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); new_cpu_data.wp_works_ok = 1; - new_cpu_data.x86_capability[0] = cpuid_edx(1); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); #endif if (xen_start_info->mod_start) { -- cgit From bfbe0eeb769e2aff2cb1fc6845c4e4b7eac40bb3 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:48 +0100 Subject: x86/mce: Fix order of AMD MCE init function call In mce_amd_feature_init() we take decisions based on mce_flags being set or not. So the feature detection using CPUID should naturally be ordered before we call mce_amd_feature_init(). Fix that here. 
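For clarity, a condensed C sketch of the corrected flow (simplified stand-ins for the kernel's types; init_vendor_amd() and the trimmed struct are illustrative, the bit positions are the ones in the hunk below):

    /* Sketch only: mce_flags must be populated from CPUID 0x80000007 EBX
     * before mce_amd_feature_init() runs, because that function consults
     * mce_flags (e.g. the smca bit). */
    struct mce_vendor_flags { unsigned overflow_recov:1, succor:1, smca:1; };
    static struct mce_vendor_flags mce_flags;

    static void mce_amd_feature_init(void)
    {
            /* takes decisions based on mce_flags being set or not */
    }

    static void init_vendor_amd(unsigned int ebx)   /* EBX of CPUID 0x80000007 */
    {
            mce_flags.overflow_recov = !!(ebx & (1u << 0));
            mce_flags.succor         = !!(ebx & (1u << 1));
            mce_flags.smca           = !!(ebx & (1u << 3));

            mce_amd_feature_init();                 /* now sees valid flags */
    }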
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..b7180801ea33 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1617,10 +1617,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_AMD: { u32 ebx = cpuid_ebx(0x80000007); - mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); mce_flags.smca = !!(ebx & BIT(3)); + mce_amd_feature_init(c); break; } -- cgit From 284b965c146f482b4a411133f62288d52b7e3a72 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:49 +0100 Subject: x86/mce/AMD: Do not perform shared bank check for future processors Fam17h and above should not require a check to see if a bank is shared or not. For shared banks, there will always be only one core that has visibility over the MSRs and only that particular core will be allowed to write to the MSRs. Fix the code to return early if we have Scalable MCA support. No change in functionality for earlier processors. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Fengguang Wu [ Massaged the changelog text, fixed kbuild test robot build warning. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b15077e94..3068ce25dfa1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -84,6 +84,13 @@ struct thresh_restart { static inline bool is_shared_bank(int bank) { + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + /* Bank 4 is for northbridge reporting and is thus shared */ return (bank == 4); } -- cgit From 60f116fca162d9488f783f5014779463243ab7a2 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:50 +0100 Subject: x86/mce/AMD: Reduce number of blocks scanned per bank From Fam17h onwards, the number of extended MCx_MISC register blocks is reduced to 4. It is an architectural change from what we had on earlier processors. Although theoritically the total number of extended MCx_MISC registers was 8 in earlier processor families, in practice we only had to use the extra registers for MC4. And only 2 of those were used. So this change does not affect older processors. Tested on Fam10h and Fam15h systems. 
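To see what the constant bounds, a minimal sketch of the per-bank scan (loop shape as in mce_amd_feature_init() later in this series; scan_blocks() and the bank-count parameter are illustrative stand-ins):

    #define NR_BLOCKS 5     /* block 0 plus at most 4 extended MCx_MISC blocks, per the changelog above */

    static void scan_blocks(unsigned int num_banks)
    {
            unsigned int bank, block;

            for (bank = 0; bank < num_banks; ++bank)
                    for (block = 0; block < NR_BLOCKS; ++block) {
                            /* read this block's MCx_MISC address; skip blocks
                             * that are invalid, locked or not present */
                    }
            /* Lowering NR_BLOCKS from 9 to 5 simply shortens this inner loop. */
    }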
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 3068ce25dfa1..5982227990c9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,7 +28,7 @@ #include #include -#define NR_BLOCKS 9 +#define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 -- cgit From f57a1f3c14b9182f1fea667f5a38a1094699db7c Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:51 +0100 Subject: x86/mce/AMD: Fix LVT offset configuration for thresholding For processor families with the Scalable MCA feature, the LVT offset for threshold interrupts is configured only in MSR 0xC0000410 and not in each per bank MISC register as was done in earlier families. Obtain the LVT offset from the correct MSR for those families. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5982227990c9..a77a4521976a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -49,6 +49,11 @@ #define DEF_LVT_OFF 0x2 #define DEF_INT_TYPE_APIC 0x2 +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -142,6 +147,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -300,7 +313,19 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) goto init; b.interrupt_enable = 1; - new = (high & MASK_LVTOFF_HI) >> 20; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + break; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (high & MASK_LVTOFF_HI) >> 20; + } + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && -- cgit From 429893b16d35d309ed6b35136aad5f908a08d9b9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Jan 2016 20:41:52 +0100 Subject: x86/mce/AMD: Carve out threshold block preparation mce_amd_feature_init() was getting pretty fat, carve out the threshold_block setup into a separate function in order to simplify flow and make it more understandable. No functionality change. 
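After the carve-out, the body of the init loop collapses to a single helper call; condensed excerpt of the hunk below (argument setup elided):

    for (bank = 0; bank < mca_cfg.banks; ++bank) {
            for (block = 0; block < NR_BLOCKS; ++block) {
                    /* resolve 'address', read (low, high), skip unusable blocks ... */
                    offset = prepare_threshold_block(bank, block, address, offset, high);
            }
    }

The helper threads the chosen LVT offset through its return value and keeps the early-exit paths (non-interrupt-capable blocks, SMCA MSR read failures) in one place instead of scattering gotos through the loop.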
Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1453750913-4781-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 87 ++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a77a4521976a..f2860a118b71 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -267,14 +267,59 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + } + + offset = setup_APIC_mce_threshold(offset, new); + + if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - struct threshold_block b; - unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1, new; + int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -299,41 +344,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) (high & MASK_LOCKED_HI)) continue; - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; - b.interrupt_capable = lvt_interrupt_supported(bank, high); - - if (!b.interrupt_capable) - goto init; - - b.interrupt_enable = 1; - - if (mce_flags.smca) { - u32 smca_low, smca_high; - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - break; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - } else { - new = (high & MASK_LVTOFF_HI) >> 20; - } - - offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && - (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; - -init: - mce_threshold_block_init(&b, offset); + offset = prepare_threshold_block(bank, block, address, offset, high); } } -- cgit From e6c8f1873be8a14c7e44202df1f7e6ea61bf3352 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:53 +0100 Subject: x86/mce/AMD: Set MCAX Enable bit It is required for the OS to acknowledge that it is using the MCAX register set and its associated fields by setting the 'McaXEnable' bit in each bank's MCi_CONFIG register. If it is not set, then all UC errors will cause a system panic. 
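Concretely, the acknowledgement is a read-modify-write of each bank's MCi_CONFIG MSR. A condensed sketch of the hunk below (rdmsr_safe()/wrmsr() are the kernel helpers the patch uses; the simplified lo/hi names are illustrative, and bit 32 of the MSR is bit 0 of its high half):

    #define MSR_AMD64_SMCA_MC0_CONFIG       0xc0002004
    #define MSR_AMD64_SMCA_MCx_CONFIG(x)    (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
    #define SMCA_MCAX_EN_OFF                0x1     /* MCAX enable: bit 0 of the high word */

    u32 lo, hi, addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);

    if (mce_flags.smca && !rdmsr_safe(addr, &lo, &hi)) {
            hi |= SMCA_MCAX_EN_OFF;         /* acknowledge use of the MCAX register space */
            wrmsr(addr, lo, hi);
    }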
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..552346598dab 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -264,6 +264,10 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) +/* 'SMCA': AMD64 Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f2860a118b71..88de27bd5797 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -54,6 +54,14 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +/* + * OS is required to set the MCAX bit to acknowledge that it is now using the + * new MSR ranges and new registers under each bank. It also means that the OS + * will configure deferred errors in the new MCx_CONFIG register. If the bit is + * not set, uncorrectable errors will cause a system panic. + */ +#define SMCA_MCAX_EN_OFF 0x1 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -292,6 +300,12 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, if (mce_flags.smca) { u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + smca_high |= SMCA_MCAX_EN_OFF; + wrmsr(smca_addr, smca_low, smca_high); + } /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) -- cgit From 1b74dde7c47c19a73ea3e9fac95ac27b5d3d50c5 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Tue, 2 Feb 2016 11:45:02 +0800 Subject: x86/cpu: Convert printk(KERN_ ...) to pr_(...) - Use the more current logging style pr_(...) instead of the old printk(KERN_ ...). - Convert pr_warning() to pr_warn(). Signed-off-by: Chen Yucong Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454384702-21707-1-git-send-email-slaoub@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 23 +++++++-------- arch/x86/kernel/cpu/bugs_64.c | 2 +- arch/x86/kernel/cpu/centaur.c | 10 +++---- arch/x86/kernel/cpu/common.c | 42 +++++++++++++-------------- arch/x86/kernel/cpu/cyrix.c | 10 +++---- arch/x86/kernel/cpu/hypervisor.c | 2 +- arch/x86/kernel/cpu/intel.c | 10 +++---- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/mcheck/mce-inject.c | 15 +++++----- arch/x86/kernel/cpu/mcheck/p5.c | 18 +++++------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 15 +++++----- arch/x86/kernel/cpu/mcheck/threshold.c | 4 +-- arch/x86/kernel/cpu/mcheck/winchip.c | 5 ++-- arch/x86/kernel/cpu/microcode/amd.c | 2 +- arch/x86/kernel/cpu/mshyperv.c | 8 +++--- arch/x86/kernel/cpu/mtrr/centaur.c | 2 +- arch/x86/kernel/cpu/mtrr/cleanup.c | 44 ++++++++++++++--------------- arch/x86/kernel/cpu/mtrr/generic.c | 23 ++++++++------- arch/x86/kernel/cpu/mtrr/main.c | 20 ++++++------- arch/x86/kernel/cpu/perf_event.c | 9 +++--- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 10 +++---- arch/x86/kernel/cpu/perf_event_amd_uncore.c | 4 +-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++-- arch/x86/kernel/cpu/rdrand.c | 2 +- arch/x86/kernel/cpu/topology.c | 4 +-- arch/x86/kernel/cpu/transmeta.c | 8 +++--- arch/x86/kernel/cpu/vmware.c | 5 ++-- 27 files changed, 146 insertions(+), 159 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a07956a08936..97c59fd60702 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -117,7 +117,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) void (*f_vide)(void); u64 d, d2; - printk(KERN_INFO "AMD K6 stepping B detected - "); + pr_info("AMD K6 stepping B detected - "); /* * It looks like AMD fixed the 2.6.2 bug and improved indirect @@ -133,10 +133,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk(KERN_CONT - "system stability may be impaired when more than 32 MB are used.\n"); + pr_cont("system stability may be impaired when more than 32 MB are used.\n"); else - printk(KERN_CONT "probably OK (after B9730xxxx).\n"); + pr_cont("probably OK (after B9730xxxx).\n"); } /* K6 with old style WHCR */ @@ -154,7 +153,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + pr_info("Enabling old style K6 write allocation for %d Mb\n", mbytes); } return; @@ -175,7 +174,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + pr_info("Enabling new style K6 write allocation for %d Mb\n", mbytes); } @@ -202,7 +201,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) */ if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + pr_info("Enabling disabled K7/SSE Support.\n"); msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } @@ -216,9 +215,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk(KERN_INFO - "CPU: CLK_CTL MSR was %x. 
Reprogramming to %x\n", - l, ((l & 0x000fffff)|0x20000000)); + pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -485,7 +483,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { unsigned long pfn = tseg >> PAGE_SHIFT; - printk(KERN_DEBUG "tseg: %010llx\n", tseg); + pr_debug("tseg: %010llx\n", tseg); if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } @@ -500,8 +498,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_K7_HWCR, val); if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 04f0fe5af83e..a972ac4c7e7d 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6e483c..ce197bb7c129 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_FCR, lo, hi); lo |= ACE_FCR; /* enable ACE unit */ wrmsr(MSR_VIA_FCR, lo, hi); - printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + pr_info("CPU: Enabled ACE h/w crypto\n"); } /* enable RNG unit, if present and disabled */ @@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_RNG, lo, hi); lo |= RNG_ENABLE; /* enable RNG unit */ wrmsr(MSR_VIA_RNG, lo, hi); - printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + pr_info("CPU: Enabled h/w RNG\n"); } /* store Centaur Extended Feature Flags as @@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c) name = "C6"; fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; fcr_clr = DPDC; - printk(KERN_NOTICE "Disabling bugged TSC.\n"); + pr_notice("Disabling bugged TSC.\n"); clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: @@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c) newlo = (lo|fcr_set) & (~fcr_clr); if (newlo != lo) { - printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", + pr_info("Centaur FCR was 0x%X now 0x%X\n", lo, newlo); wrmsr(MSR_IDT_FCR1, newlo, hi); } else { - printk(KERN_INFO "Centaur FCR is 0x%X\n", lo); + pr_info("Centaur FCR is 0x%X\n", lo); } /* Emulate MTRRs using Centaur's MCR. 
*/ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de8f60a..68a80e9b67fc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -228,7 +228,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) lo |= 0x200000; wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); + pr_notice("CPU serial number disabled.\n"); clear_cpu_cap(c, X86_FEATURE_PN); /* Disabling the serial number may affect the cpuid level */ @@ -329,9 +329,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - printk(KERN_WARNING - "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); } } @@ -510,7 +509,7 @@ void detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + pr_info_once("CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -531,10 +530,10 @@ void detect_ht(struct cpuinfo_x86 *c) out: if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + pr_info("CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + pr_info("CPU: Processor Core ID: %d\n", + c->cpu_core_id); printed = 1; } #endif @@ -559,9 +558,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c) } } - printk_once(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n" \ - "CPU: Your system may be unstable.\n", v); + pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -760,7 +758,7 @@ void __init early_cpu_init(void) int count = 0; #ifdef CONFIG_PROCESSOR_SELECT - printk(KERN_INFO "KERNEL supported cpus:\n"); + pr_info("KERNEL supported cpus:\n"); #endif for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { @@ -778,7 +776,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + pr_info(" %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -1061,7 +1059,7 @@ static void __print_cpu_msr(void) for (index = index_min; index < index_max; index++) { if (rdmsrl_safe(index, &val)) continue; - printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + pr_info(" MSR%08x: %016llx\n", index, val); } } } @@ -1100,19 +1098,19 @@ void print_cpu_info(struct cpuinfo_x86 *c) } if (vendor && !strstr(c->x86_model_id, vendor)) - printk(KERN_CONT "%s ", vendor); + pr_cont("%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + pr_cont("%s", c->x86_model_id); else - printk(KERN_CONT "%d86", c->x86); + pr_cont("%d86", c->x86); - printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); + pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); + pr_cont(", stepping: 0x%x)\n", c->x86_mask); else - printk(KERN_CONT ")\n"); + pr_cont(")\n"); print_cpu_msr(c); } @@ -1438,7 +1436,7 @@ void cpu_init(void) show_ucode_info_early(); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + 
pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e79637..187bb583d0df 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -103,7 +103,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c) local_irq_restore(flags); if (ccr5 & 2) { /* possible wrong calibration done */ - printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n"); + pr_info("Recalibrating delay loop with SLOP bit reset\n"); calibrate_delay(); c->loops_per_jiffy = loops_per_jiffy; } @@ -115,7 +115,7 @@ static void set_cx86_reorder(void) { u8 ccr3; - printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); + pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n"); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -128,7 +128,7 @@ static void set_cx86_reorder(void) static void set_cx86_memwb(void) { - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); @@ -268,7 +268,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) * VSA1 we work around however. */ - printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); + pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; /* We do this before the PCI layer is running. However we @@ -426,7 +426,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c) if (dir0 == 5 || dir0 == 3) { unsigned char ccr3; unsigned long flags; - printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); + pr_info("Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index d820d8eae96b..73d391ae452f 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -56,7 +56,7 @@ detect_hypervisor_vendor(void) } if (max_pri) - printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); + pr_info("Hypervisor detected: %s\n", x86_hyper->name); } void init_hypervisor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648bc1a0a..05b9211ea0f7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && c->microcode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); } @@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { - printk(KERN_INFO "Disabled fast string operations\n"); + pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); } @@ -176,7 +176,7 @@ int ppro_with_ram_bug(void) boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask < 8) { - printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + pr_info("Pentium Pro with Errata#50 detected. 
Taking evasive action.\n"); return 1; } return 0; @@ -225,7 +225,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_F00F); if (!f00f_workaround_enabled) { - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } } @@ -244,7 +244,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Forcefully enable PAE if kernel parameter "forcepae" is present. */ if (forcepae) { - printk(KERN_WARNING "PAE forced!\n"); + pr_warn("PAE forced!\n"); set_cpu_cap(c, X86_FEATURE_PAE); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c52388cf4..6ed779efff26 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf, err = amd_set_l3_disable_slot(nb, cpu, slot, val); if (err) { if (err == -EEXIST) - pr_warning("L3 slot %d in use/index already disabled!\n", + pr_warn("L3 slot %d in use/index already disabled!\n", slot); return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 4cfba4371a71..517619ea6498 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -115,7 +115,7 @@ static int raise_local(void) int cpu = m->extcpu; if (m->inject_flags & MCJ_EXCEPTION) { - printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + pr_info("Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: /* @@ -128,15 +128,15 @@ static int raise_local(void) raise_exception(m, NULL); break; default: - printk(KERN_INFO "Invalid MCE context\n"); + pr_info("Invalid MCE context\n"); ret = -EINVAL; } - printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + pr_info("MCE exception done on CPU %d\n", cpu); } else if (m->status) { - printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; @@ -183,8 +183,7 @@ static void raise_mce(struct mce *m) start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { - printk(KERN_ERR - "Timeout waiting for mce inject %lx\n", + pr_err("Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -241,7 +240,7 @@ static int inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; - printk(KERN_INFO "Machine check injector initialized\n"); + pr_info("Machine check injector initialized\n"); register_mce_write_callback(mce_write); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 12402e10aeff..2a0717bf8033 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG - "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", - smp_processor_id(), loaddr, lotype); + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + 
smp_processor_id(), loaddr, lotype); if (lotype & (1<<5)) { - printk(KERN_EMERG - "CPU#%d: Possible thermal failure (CPU on fire ?).\n", - smp_processor_id()); + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO - "Intel old style machine check architecture supported.\n"); + pr_info("Intel old style machine check architecture supported.\n"); /* Enable MCE: */ cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Intel old style machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 2c5aaf8c2e2f..0b445c2ff735 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level) /* if we just entered the thermal event */ if (new_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); @@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level) } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, + pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; @@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { if (system_state == SYSTEM_BOOTING) - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); + pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } @@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); + pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? 
"TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 7245980186ee..fcf9ae9384f4 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -12,8 +12,8 @@ static void default_threshold_interrupt(void) { - printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", - THRESHOLD_APIC_VECTOR); + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); } void (*mce_threshold_vector)(void) = default_threshold_interrupt; diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 01dd8702880b..c6a722e1d011 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { ist_enter(regs); - printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); ist_exit(regs); @@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Winchip machine check reporting enabled on CPU#0.\n"); + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..75d3aab5f7b2 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -953,7 +953,7 @@ struct microcode_ops * __init init_amd_microcode(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 20e242ea1bc4..4e7c6933691c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", - ms_hyperv.features, ms_hyperv.hints); + pr_info("HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { @@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", - lapic_timer_frequency); + pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); } #endif diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index 316fe3e60a97..3d689937fc1b 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t */ if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - pr_warning("mtrr: only write-combining%s supported\n", + pr_warn("mtrr: only write-combining%s supported\n", centaur_mcr_type ? 
" and uncacheable are" : " is"); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0d98503c2245..31e951ce6dff 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -57,9 +57,9 @@ static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; -#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) +#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0) -#define BIOS_BUG_MSG KERN_WARNING \ +#define BIOS_BUG_MSG \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init @@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base, base + size); } if (debug_print) { - printk(KERN_DEBUG "After WB checking\n"); + pr_debug("After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(BIOS_BUG_MSG, i); + pr_warn(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, extra_remove_base + extra_remove_size); if (debug_print) { - printk(KERN_DEBUG "After UC checking\n"); + pr_debug("After UC checking\n"); for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } } @@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, /* sort the ranges */ nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { - printk(KERN_DEBUG "After sorting\n"); + pr_debug("After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void) start_base = to_size_factor(start_base, &start_factor), type = range_state[i].type; - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? 
"UC" : @@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; /* Print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); + pr_debug("original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); @@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM covered: %ldM\n", + pr_info("total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { @@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits) if (!result[i].bad) { set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } - printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " - "will find optimal one\n"); + pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n"); } i = 0; @@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); - printk(KERN_INFO "\n"); + pr_info("\n"); } i++; @@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits) index_good = mtrr_search_optimal_index(); if (index_good != -1) { - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + pr_info("Found optimal setting for mtrr clean up\n"); i = index_good; mtrr_print_out_one_result(i); @@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits) gran_size <<= 10; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } else { @@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_print_out_one_result(i); } - printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); - printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + pr_info("mtrr_cleanup: can not find optimal value\n"); + pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n"); return 0; } @@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { - printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + pr_info("CPU MTRRs all blank - virtualized system.\n"); return 0; } @@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) end_pfn); if (total_trim_size) { - pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); + pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index c870af161008..fcbcb2f678ca 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) rdmsr(MSR_K8_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { - printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; @@ -501,14 +501,14 @@ void __init mtrr_state_warn(void) if (!mask) return; if 
(mask & MTRR_CHANGE_MASK_FIXED) - pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); - printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); - printk(KERN_INFO "mtrr: corrected configuration.\n"); + pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); + pr_info("mtrr: corrected configuration.\n"); } /* @@ -519,8 +519,7 @@ void __init mtrr_state_warn(void) void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { if (wrmsr_safe(msr, a, b) < 0) { - printk(KERN_ERR - "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); } } @@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1ULL<<(hi - 1)) - 1); if (tmp != mask) { - printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask = tmp; } @@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size, boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } @@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { - pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149ee91c..ba80d68f683e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size, return error; if (type >= MTRR_NUM_TYPES) { - pr_warning("mtrr: type: %u invalid\n", type); + pr_warn("mtrr: type: %u invalid\n", type); return -EINVAL; } /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - pr_warning("mtrr: your processor doesn't support write-combining\n"); + pr_warn("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - pr_warning("mtrr: zero sized request\n"); + pr_warn("mtrr: zero sized request\n"); return -EINVAL; } if ((base | (base + size - 1)) >> (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { - pr_warning("mtrr: base or size exceeds the MTRR width\n"); + pr_warn("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -348,7 +348,7 @@ int 
mtrr_add_page(unsigned long base, unsigned long size, } else if (types_compatible(type, ltype)) continue; } - pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; @@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (ltype != type) { if (types_compatible(type, ltype)) continue; - pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), mtrr_attrib_to_str(type)); goto out; @@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; @@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg >= max) { - pr_warning("mtrr: register: %d too big\n", reg); + pr_warn("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - pr_warning("mtrr: MTRR %d not used\n", reg); + pr_warn("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - pr_warning("mtrr: reg: %d has count=0\n", reg); + pr_warn("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1b443db2db50..7402c8182813 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -254,15 +254,16 @@ static bool check_hw_exists(void) * We still allow the PMU driver to operate: */ if (bios_fail) { - printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); - printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); + pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); + pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", + reg_fail, val_fail); } return true; msr_fail: - printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + pr_cont("Broken PMU hardware detected, using software events only.\n"); + pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", boot_cpu_has(X86_FEATURE_HYPERVISOR) ? 
KERN_INFO : KERN_ERR, reg, val_new); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 989d3c215d2b..aa12f9509cfb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -670,7 +670,7 @@ static __init int perf_event_ibs_init(void) perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); - printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; } @@ -774,14 +774,14 @@ static int setup_ibs_ctl(int ibs_eilvt_off) pci_read_config_dword(cpu_cfg, IBSCTL, &value); if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { pci_dev_put(cpu_cfg); - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x\n", value); + pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", + value); return -EINVAL; } } while (1); if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS\n"); + pr_debug("No CPU node configured for IBS\n"); return -ENODEV; } @@ -810,7 +810,7 @@ static void force_ibs_eilvt_setup(void) preempt_enable(); if (offset == APIC_EILVT_NR_MAX) { - printk(KERN_DEBUG "No EILVT entry available\n"); + pr_debug("No EILVT entry available\n"); return; } diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 49742746a6c9..19a17363a21d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -536,7 +536,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - printk(KERN_INFO "perf: AMD NB counters detected\n"); + pr_info("perf: AMD NB counters detected\n"); ret = 0; } @@ -550,7 +550,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_l2; - printk(KERN_INFO "perf: AMD L2I counters detected\n"); + pr_info("perf: AMD L2I counters detected\n"); ret = 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 10602f0a438f..7c79261ed939 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1325,13 +1325,13 @@ void __init intel_ds_init(void) switch (format) { case 0: - printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); + pr_cont("PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; break; case 1: - printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); + pr_cont("PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; @@ -1351,7 +1351,7 @@ void __init intel_ds_init(void) break; default: - printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); + pr_cont("no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; } } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 819d94982e07..f6f50c4ceaec 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) for (i = 0; i < SANITY_CHECK_LOOPS; i++) { if (!rdrand_long(&tmp)) { clear_cpu_cap(c, X86_FEATURE_RDRAND); - printk_once(KERN_WARNING "rdrand: disabled\n"); + pr_warn_once("rdrand: disabled\n"); return; } } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4c60eaf0571c..cd531355e838 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -87,10 +87,10 @@ 
void detect_extended_topology(struct cpuinfo_x86 *c) c->x86_max_cores = (core_level_siblings / smp_num_siblings); if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + pr_info("CPU: Physical Processor ID: %d\n", c->phys_proc_id); if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", + pr_info("CPU: Processor Core ID: %d\n", c->cpu_core_id); printed = 1; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7aceca6..e3b4d1841175 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860001) { cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); if (cpu_rev != 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n", (cpu_rev >> 24) & 0xff, (cpu_rev >> 16) & 0xff, (cpu_rev >> 8) & 0xff, @@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860002) { cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); if (cpu_rev == 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n", + pr_info("CPU: Processor revision %08X, %u MHz\n", new_cpu_rev, cpu_freq); } - printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", (cms_rev1 >> 24) & 0xff, (cms_rev1 >> 16) & 0xff, (cms_rev1 >> 8) & 0xff, @@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) (void *)&cpu_info[56], (void *)&cpu_info[60]); cpu_info[64] = '\0'; - printk(KERN_INFO "CPU: %s\n", cpu_info); + pr_info("CPU: %s\n", cpu_info); } /* Unhide possibly hidden capability flags */ diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 628a059a9a06..364e58346897 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void) tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); - printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); @@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void) if (ebx != UINT_MAX) x86_platform.calibrate_tsc = vmware_get_tsc_khz; else - printk(KERN_WARNING - "Failed to get TSC freq from the hypervisor\n"); + pr_warn("Failed to get TSC freq from the hypervisor\n"); } /* -- cgit From ca0e30dcaa53a3fcb2dfdf74252d30bc40603eea Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 1 Feb 2016 22:06:58 +0000 Subject: efi: Add nonblocking option to efi_query_variable_store() The function efi_query_variable_store() may be invoked by efivar_entry_set_nonblocking(), which itself takes care to only call a non-blocking version of the SetVariable() runtime wrapper. However, efi_query_variable_store() may call the SetVariable() wrapper directly, as well as the wrapper for QueryVariableInfo(), both of which could deadlock in the same way we are trying to prevent by calling efivar_entry_set_nonblocking() in the first place. So instead, modify efi_query_variable_store() to use the non-blocking variants of QueryVariableInfo() (and give up rather than free up space if the available space is below EFI_MIN_RESERVE) if invoked with the 'nonblocking' argument set to true. 
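For illustration only, a minimal sketch of how a caller is expected to use the new 'nonblocking' argument; the wrapper name example_check_store() is made up here, and the real call sites live in the efivars code rather than in the hunk below:

	/*
	 * Illustrative sketch, not part of this patch: a path that must not
	 * sleep (e.g. writing a variable from a crash handler) passes
	 * nonblocking=true and simply fails with EFI_OUT_OF_RESOURCES when
	 * the store is within EFI_MIN_RESERVE of being full, instead of
	 * trying to free up space.
	 */
	static efi_status_t example_check_store(u32 attributes, unsigned long size)
	{
		return efi_query_variable_store(attributes, size, true);
	}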
Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1454364428-494-5-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 453504662a33..2326bf51978f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -56,6 +56,33 @@ void efi_delete_dummy_variable(void) 0, NULL); } +/* + * In the nonblocking case we do not attempt to perform garbage + * collection if we do not have enough free space. Rather, we do the + * bare minimum check and give up immediately if the available space + * is below EFI_MIN_RESERVE. + * + * This function is intended to be small and simple because it is + * invoked from crash handler paths. + */ +static efi_status_t +query_variable_store_nonblocking(u32 attributes, unsigned long size) +{ + efi_status_t status; + u64 storage_size, remaining_size, max_size; + + status = efi.query_variable_info_nonblocking(attributes, &storage_size, + &remaining_size, + &max_size); + if (status != EFI_SUCCESS) + return status; + + if (remaining_size - size < EFI_MIN_RESERVE) + return EFI_OUT_OF_RESOURCES; + + return EFI_SUCCESS; +} + /* * Some firmware implementations refuse to boot if there's insufficient space * in the variable store. Ensure that we never use more than a safe limit. @@ -63,7 +90,8 @@ void efi_delete_dummy_variable(void) * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable * store. */ -efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) +efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, + bool nonblocking) { efi_status_t status; u64 storage_size, remaining_size, max_size; @@ -71,6 +99,9 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) return 0; + if (nonblocking) + return query_variable_store_nonblocking(attributes, size); + status = efi.query_variable_info(attributes, &storage_size, &remaining_size, &max_size); if (status != EFI_SUCCESS) -- cgit From 66dbe99cfe30e113d2e571e68b9b6a1a8985a157 Mon Sep 17 00:00:00 2001 From: Môshe van der Sterre Date: Mon, 1 Feb 2016 22:07:03 +0000 Subject: x86/efi/bgrt: Don't ignore the BGRT if the 'valid' bit is 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unintuitively, the BGRT graphic is apparently meant to be usable if the valid bit in not set. The valid bit only conveys uncertainty about the validity in relation to the screen state. Windows 10 actually uses the BGRT image for its boot screen even if not 'valid', for example when the user triggered the boot menu. Because it is unclear if all firmwares will provide a usable graphic in this case, we now look at the BMP magic number as an additional check. 
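As background for the check added below: a BMP image always starts with the two ASCII bytes 'B' (0x42) and 'M' (0x4d), and read as a little-endian 16-bit field that header id comes back as 0x4d42, which is the constant the patch compares against. A minimal sketch, assuming only that the 'id' field is the first 16 bits of the header:

	/*
	 * Illustrative only: how the 0x4d42 magic value arises from the
	 * first two bytes of a BMP file.
	 */
	u8 hdr[2] = { 'B', 'M' };		/* 0x42, 0x4d */
	u16 id = hdr[0] | (hdr[1] << 8);	/* little-endian load => 0x4d42 */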
Reviewed-by: Josh Triplett Signed-off-by: Môshe van der Sterre Signed-off-by: Matt Fleming Cc: =?UTF-8?q?M=C3=B4she=20van=20der=20Sterre?= Link: http://lkml.kernel.org/r/1454364428-494-10-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi-bgrt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index b0970661870a..a2433817c987 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -57,11 +57,6 @@ void __init efi_bgrt_init(void) bgrt_tab->status); return; } - if (bgrt_tab->status != 1) { - pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n", - bgrt_tab->status); - return; - } if (bgrt_tab->image_type != 0) { pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", bgrt_tab->image_type); @@ -80,6 +75,11 @@ void __init efi_bgrt_init(void) memcpy(&bmp_header, image, sizeof(bmp_header)); memunmap(image); + if (bmp_header.id != 0x4d42) { + pr_err("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n", + bmp_header.id); + return; + } bgrt_image_size = bmp_header.size; bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); -- cgit From 1e82b94790709fb2a22d16d53bb04d751fb3878d Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Mon, 1 Feb 2016 22:07:05 +0000 Subject: x86/efi: Show actual ending addresses in efi_print_memmap Adjust efi_print_memmap to print the real end address of each range, not 1 byte beyond. This matches other prints like those for SRAT and nosave memory. While investigating grub persistent memory corruption issues, it was helpful to make this table match the ending address convention used by: * the kernel's e820 table prints BIOS-e820: [mem 0x0000001680000000-0x0000001c7fffffff] reserved * the kernel's nosave memory prints PM: Registered nosave memory: [mem 0x880000000-0xc7fffffff] * the kernel's ACPI System Resource Affinity Table prints SRAT: Node 1 PXM 1 [mem 0x480000000-0x87fffffff] * grub's lsmmap and lsefimmap commands reserved 0000001680000000-0000001c7fffffff 00600000 24GiB UC WC WT WB NV * the UEFI shell's memmap command Reserved 000000007FC00000-000000007FFFFFFF 0000000000000400 0000000000000001 For example, if you grep all the various logs for c7fffffff, you won't find the kernel's line if it uses c80000000. Also, change the closing ) to ] to match the opening [. old: efi: mem61: [Persistent Memory | | | | | | | |WB|WT|WC|UC] range=[0x0000000880000000-0x0000000c80000000) (16384MB) new: efi: mem61: [Persistent Memory | | | | | | | |WB|WT|WC|UC] range=[0x0000000880000000-0x0000000c7fffffff] (16384MB) Signed-off-by: Robert Elliott Signed-off-by: Matt Fleming Reviewed-by: Laszlo Ersek Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Leif Lindholm Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1454364428-494-12-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index bdd9477f937c..e80826e6f3a9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -235,10 +235,10 @@ void __init efi_print_memmap(void) char buf[64]; md = p; - pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n", + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", i, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } #endif /* EFI_DEBUG */ -- cgit From 02afeaae9843733a39cd9b11053748b2d1dc5ae7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:38 -0800 Subject: x86/boot: Fix early command-line parsing when matching at end The x86 early command line parsing in cmdline_find_option_bool() is buggy. If it matches a specified 'option' all the way to the end of the command-line, it will consider it a match. For instance, cmdline = "foo"; cmdline_find_option_bool(cmdline, "fool"); will return 1. This is particularly annoying since we have actual FPU options like "noxsave" and "noxsaves" So, command-line "foo bar noxsave" will match *BOTH* a "noxsave" and "noxsaves". (This turns out not to be an actual problem because "noxsave" implies "noxsaves", but it's still confusing.) To fix this, we simplify the code and stop tracking 'len'. 'len' was trying to indicate either the NULL terminator *OR* the end of a non-NULL-terminated command line at 'COMMAND_LINE_SIZE'. But, each of the three states is *already* checking 'cmdline' for a NULL terminator. We _only_ need to check if we have overrun 'COMMAND_LINE_SIZE', and that we can do without keeping 'len' around. Also add some commends to clarify what is going on. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225238.9AEB560C@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 422db000d727..49548bed2301 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -21,12 +21,14 @@ static inline int myisspace(u8 c) * @option: option string to look for * * Returns the position of that @option (starts counting with 1) - * or 0 on not found. + * or 0 on not found. @option will only be found if it is found + * as an entire word in @cmdline. For instance, if @option="car" + * then a cmdline which contains "cart" will not match. 
*/ int cmdline_find_option_bool(const char *cmdline, const char *option) { char c; - int len, pos = 0, wstart = 0; + int pos = 0, wstart = 0; const char *opptr = NULL; enum { st_wordstart = 0, /* Start of word/after whitespace */ @@ -37,11 +39,14 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); - if (!len) + if (!strlen(cmdline)) return 0; - while (len--) { + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos < COMMAND_LINE_SIZE) { c = *(char *)cmdline++; pos++; @@ -58,17 +63,26 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) /* fall through */ case st_wordcmp: - if (!*opptr) + if (!*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for. If the + * command-line has a space _or_ ends, then + * we matched! + */ if (!c || myisspace(c)) return wstart; else state = st_wordskip; - else if (!c) + } else if (!c) { + /* + * Hit the NULL terminator on the end of + * cmdline. + */ return 0; - else if (c != *opptr++) + } else if (c != *opptr++) { state = st_wordskip; - else if (!len) /* last word and is matching */ - return wstart; + } break; case st_wordskip: -- cgit From abcdc1c694fa4055323cbec1cde4c2cb6b68398c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:39 -0800 Subject: x86/boot: Fix early command-line parsing when partial word matches cmdline_find_option_bool() keeps track of position in two strings: 1. the command-line 2. the option we are searchign for in the command-line We plow through each character in the command-line one at a time, always moving forward. We move forward in the option ('opptr') when we match characters in 'cmdline'. We reset the 'opptr' only when we go in to the 'st_wordstart' state. But, if we fail to match an option because we see a space (state=st_wordcmp, *opptr='\0',c=' '), we set state='st_wordskip' and 'break', moving to the next character. But, that move to the next character is the one *after* the ' '. This means that we will miss a 'st_wordstart' state. For instance, if we have cmdline = "foo fool"; and are searching for "fool", we have: "fool" opptr = ----^ "foo fool" c = --------^ We see that 'l' != ' ', set state=st_wordskip, break, and then move 'c', so: "foo fool" c = ---------^ and are still in state=st_wordskip. We will stay in wordskip until we have skipped "fool", thus missing the option we were looking for. This *only* happens when you have a partially- matching word followed by a matching one. To fix this, we always fall *into* the 'st_wordskip' state when we set it. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225239.8E1DCA58@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 49548bed2301..ff8d1beead6c 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -72,18 +72,26 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) */ if (!c || myisspace(c)) return wstart; - else - state = st_wordskip; + /* + * We hit the end of the option, but _not_ + * the end of a word on the cmdline. Not + * a match. + */ } else if (!c) { /* * Hit the NULL terminator on the end of * cmdline. */ return 0; - } else if (c != *opptr++) { - state = st_wordskip; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; } - break; + state = st_wordskip; + /* fall through */ case st_wordskip: if (!c) -- cgit From 4de07ea481361b08fe13735004dafae862482d38 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:41 -0800 Subject: x86/boot: Simplify early command line parsing __cmdline_find_option_bool() tries to account for both NULL-terminated and non-NULL-terminated strings. It keeps 'pos' to look for the end of the buffer and also looks for '!c' in a bunch of places to look for NULL termination. But, it also calls strlen(). You can't call strlen on a non-NULL-terminated string. If !strlen(cmdline), then cmdline[0]=='\0'. In that case, we will go in to the while() loop, set c='\0', hit st_wordstart, notice !c, and will immediately return 0. So, remove the strlen(). It is unnecessary and unsafe. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225241.15365E43@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index ff8d1beead6c..945a639c02dd 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -39,9 +39,6 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - if (!strlen(cmdline)) - return 0; - /* * This 'pos' check ensures we do not overrun * a non-NULL-terminated 'cmdline' -- cgit From 8c0517759a1a100a8b83134cf3c7f254774aaeba Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:43 -0800 Subject: x86/boot: Pass in size to early cmdline parsing We will use this in a few patches to implement tests for early parsing. Signed-off-by: Dave Hansen [ Aligned args properly. ] Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225243.5CC47EB6@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 945a639c02dd..5cc78bf57232 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -25,7 +25,9 @@ static inline int myisspace(u8 c) * as an entire word in @cmdline. For instance, if @option="car" * then a cmdline which contains "cart" will not match. */ -int cmdline_find_option_bool(const char *cmdline, const char *option) +static int +__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + const char *option) { char c; int pos = 0, wstart = 0; @@ -43,7 +45,7 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) * This 'pos' check ensures we do not overrun * a non-NULL-terminated 'cmdline' */ - while (pos < COMMAND_LINE_SIZE) { + while (pos < max_cmdline_size) { c = *(char *)cmdline++; pos++; @@ -101,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) return 0; /* Buffer overrun */ } + +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); +} -- cgit From 5bd28338d681dcbde2c4bee4ebea0c4e0dfcd9e4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 5 Feb 2016 14:57:54 -0600 Subject: PCI: Remove includes of empty asm-generic/pci-bridge.h include/asm-generic/pci-bridge.h is now empty, so remove every #include of it. Signed-off-by: Bjorn Helgaas Acked-by: Will Deacon (arm64) --- arch/x86/pci/common.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2879efc73a96..b4a9f23d77d9 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include -- cgit From fd09967b830c9ed60f53f318ee67907803065c9c Mon Sep 17 00:00:00 2001 From: "Wang, Rui Y" Date: Tue, 2 Feb 2016 21:56:45 +0800 Subject: crypto: sha-mb - Fix load failure On Monday, February 1, 2016 4:18 PM, Herbert Xu wrote: > > On Wed, Jan 27, 2016 at 05:08:35PM +0800, Rui Wang wrote: >> >> +static int sha1_mb_async_import(struct ahash_request *req, const void >> +*in) { >> + struct ahash_request *mcryptd_req = ahash_request_ctx(req); >> + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); >> + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); >> + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; >> + struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm); >> + struct mcryptd_hash_request_ctx *rctx; >> + struct shash_desc *desc; >> + int err; >> + >> + memcpy(mcryptd_req, req, sizeof(*req)); >> + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); >> + rctx = ahash_request_ctx(mcryptd_req); >> + desc = &rctx->desc; >> + desc->tfm = child; >> + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; >> + >> + err = crypto_shash_init(desc); >> + if (err) >> + return err; > > What is this desc for? Hi Herbert, Yeah I just realized that the call to crypto_shash_init() isn't necessary here. What it does is overwritten by crypto_ahash_import(). But this desc still needs to be initialized here because it's newly allocated by ahash_request_alloc(). We eventually calls the shash version of import() which needs desc as an argument. 
The real context to be imported is then derived from shash_desc_ctx(desc). desc is a sub-field of struct mcryptd_hash_request_ctx, which is again a sub-field of the bigger blob allocated by ahash_request_alloc(). The entire blob's size is set in sha1_mb_async_init_tfm(). So a better version is as follows: (just removed the call to crypto_shash_init()) >From 4bcb73adbef99aada94c49f352063619aa24d43d Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Mon, 14 Dec 2015 17:22:13 +0800 Subject: [PATCH v2 1/4] crypto x86/sha1_mb: Fix load failure modprobe sha1_mb fails with the following message: modprobe: ERROR: could not insert 'sha1_mb': No such device It is because it needs to set its statesize and implement its import() and export() interface. v2: remove redundant call to crypto_shash_init() Signed-off-by: Rui Wang Signed-off-by: Herbert Xu --- arch/x86/crypto/sha-mb/sha1_mb.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index a841e9765bd6..a8a0224fa0f8 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -762,6 +762,38 @@ static int sha1_mb_async_digest(struct ahash_request *req) return crypto_ahash_digest(mcryptd_req); } +static int sha1_mb_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_export(mcryptd_req, out); +} + +static int sha1_mb_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm); + struct mcryptd_hash_request_ctx *rctx; + struct shash_desc *desc; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + rctx = ahash_request_ctx(mcryptd_req); + desc = &rctx->desc; + desc->tfm = child; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + return crypto_ahash_import(mcryptd_req, in); +} + static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) { struct mcryptd_ahash *mcryptd_tfm; @@ -796,8 +828,11 @@ static struct ahash_alg sha1_mb_async_alg = { .final = sha1_mb_async_final, .finup = sha1_mb_async_finup, .digest = sha1_mb_async_digest, + .export = sha1_mb_async_export, + .import = sha1_mb_async_import, .halg = { .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_hash_ctx), .base = { .cra_name = "sha1", .cra_driver_name = "sha1_mb", -- cgit From c1a0bf347c40dd4b0a5bb10fdf4de76a1fbbbe8c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 09:57:34 +0100 Subject: x86/mm/numa: Clean up numa_clear_kernel_node_hotplug() So we fixed an overflow bug in numa_clear_kernel_node_hotplug(): 2b54ab3c66d4 ("x86/mm/numa: Fix memory corruption on 32-bit NUMA kernels") ... and the bug was indirectly caused by poor coding style, such as using start/end local variables unnecessarily, which lost the physaddr_t type. So make the code more readable and try to fully comment all the thinking behind the logic. No change in functionality. 
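[ Note: as a rough illustration of the type-loss hazard mentioned above -- a minimal userspace sketch with made-up values, not code from the kernel tree: on 32-bit PAE kernels phys_addr_t is 64 bits wide while unsigned long is only 32, so funnelling a physical range through unsigned long locals silently truncates any address above 4 GiB. ]

    /* Illustrative stand-ins for the kernel types; builds as a plain C program. */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t demo_phys_addr_t;   /* phys_addr_t with CONFIG_PHYS_ADDR_T_64BIT */
    typedef uint32_t demo_ulong;         /* unsigned long on a 32-bit kernel */

    int main(void)
    {
            demo_phys_addr_t start = 0x180000000ULL;  /* 6 GiB, reachable via PAE */
            demo_ulong lost = start;                  /* truncation happens here */

            printf("phys_addr_t value:        %#llx\n", (unsigned long long)start);
            printf("after unsigned long (32): %#llx\n", (unsigned long long)lost);
            return 0;
    }

Keeping everything in struct numa_memblk / phys_addr_t terms, as the new code below does, avoids creating such intermediate variables in the first place.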
Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 65 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index d04f8094bc23..ede4506abb18 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -465,46 +465,65 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ static void __init numa_clear_kernel_node_hotplug(void) { - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - phys_addr_t start, end; - struct memblock_region *r; + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; + struct numa_memblk *mb = numa_meminfo.blk + i; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); } /* - * Mark all kernel nodes. + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. * - * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo - * may not include all the memblock.reserved memory ranges because - * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, r) - if (r->nid != MAX_NUMNODES) - node_set(r->nid, numa_kernel_nodes); + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. 
+ * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) - continue; + struct numa_memblk *mb = numa_meminfo.blk + i; - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; + if (!node_isset(mb->nid, reserved_nodemask)) + continue; - memblock_clear_hotplug(start, end - start); + memblock_clear_hotplug(mb->start, mb->end - mb->start); } } -- cgit From 5f7ee246850ba18a6a7bcb3d5eddb6db68354688 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 12:14:55 +0100 Subject: x86/mm/numa: Check for failures in numa_clear_kernel_node_hotplug() numa_clear_kernel_node_hotplug() uses memblock_set_node() without checking for failures. memblock_set_node() is a complex function that might extend the memblock array - which extension might fail - so check for this possibility. It's not supposed to happen (because realistically if we have so little memory that this fails then we likely won't be able to boot anyway), but do the check nevertheless. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ede4506abb18..f70c1ff46125 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -491,8 +491,10 @@ static void __init numa_clear_kernel_node_hotplug(void) */ for (i = 0; i < numa_meminfo.nr_blks; i++) { struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; - memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); } /* -- cgit From fa9cbf320e996eaa3d219344b6f7013b096cafd9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:04 +0100 Subject: perf/x86: Move perf_event.c ............... => x86/events/core.c Also, keep the churn at minimum by adjusting the include "perf_event.h" when each file gets moved. 
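[ Note: the churn-avoidance mentioned above is visible in the moved file itself -- rather than relocating the private header in the same patch, the new arch/x86/events/core.c keeps pulling it from its old location via a relative include (quoted from the diff below), presumably so that each header can be moved in its own follow-up patch without touching every includer at once: ]

    /* arch/x86/events/core.c, as created by this patch: the private header
     * still lives under arch/x86/kernel/cpu/ at this point. */
    #include "../kernel/cpu/perf_event.h"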
Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kbuild | 3 +- arch/x86/events/Makefile | 1 + arch/x86/events/core.c | 2429 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 - arch/x86/kernel/cpu/perf_event.c | 2429 -------------------------------------- 5 files changed, 2432 insertions(+), 2432 deletions(-) create mode 100644 arch/x86/events/Makefile create mode 100644 arch/x86/events/core.c delete mode 100644 arch/x86/kernel/cpu/perf_event.c (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 1538562cc720..eb3abf8ac44e 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,6 +1,7 @@ - obj-y += entry/ +obj-$(CONFIG_PERF_EVENTS) += events/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile new file mode 100644 index 000000000000..3fad3ce1bbab --- /dev/null +++ b/arch/x86/events/Makefile @@ -0,0 +1 @@ +obj-y += core.o diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c new file mode 100644 index 000000000000..90ca601b5d5c --- /dev/null +++ b/arch/x86/events/core.c @@ -0,0 +1,2429 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kernel/cpu/perf_event.h" + +struct x86_pmu x86_pmu __read_mostly; + +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, +}; + +struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; + +u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +u64 x86_perf_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - x86_pmu.cntval_bits; + u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; + s64 delta; + + if (idx == INTEL_PMC_IDX_FIXED_BTS) + return 0; + + /* + * Careful: an NMI might modify the previous event value. 
+ * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +/* + * Find and validate any extra registers to set up. + */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + struct extra_reg *er; + + reg = &event->hw.extra_reg; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; + break; + } + return 0; +} + +static atomic_t active_events; +static atomic_t pmc_refcount; +static DEFINE_MUTEX(pmc_reserve_mutex); + +#ifdef CONFIG_X86_LOCAL_APIC + +static bool reserve_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu_config_addr(i)); + + i = x86_pmu.num_counters; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu_event_addr(i)); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); + } +} + +#else + +static bool reserve_pmc_hardware(void) { return true; } +static void release_pmc_hardware(void) {} + +#endif + +static bool check_hw_exists(void) +{ + u64 val, val_fail, val_new= ~0; + int i, reg, reg_fail, ret = 0; + int bios_fail = 0; + int reg_safe = -1; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } else { + reg_safe = i; + } + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } + } + } + + /* + * If all the counters are enabled, the below test will always + * fail. The tools will also become useless in this scenario. + * Just fail and disable the hardware counters. 
+ */ + + if (reg_safe == -1) { + reg = reg_safe; + goto msr_fail; + } + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + reg = x86_pmu_event_addr(reg_safe); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); + if (ret || val != val_new) + goto msr_fail; + + /* + * We still allow the PMU driver to operate: + */ + if (bios_fail) { + pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); + pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", + reg_fail, val_fail); + } + + return true; + +msr_fail: + pr_cont("Broken PMU hardware detected, using software events only.\n"); + pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, + reg, val_new); + + return false; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + x86_release_hardware(); + atomic_dec(&active_events); +} + +void hw_perf_lbr_event_destroy(struct perf_event *event) +{ + hw_perf_event_destroy(event); + + /* undo the lbr/bts event accounting */ + x86_del_exclusive(x86_lbr_exclusive_lbr); +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); +} + +int x86_reserve_hardware(void) +{ + int err = 0; + + if (!atomic_inc_not_zero(&pmc_refcount)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&pmc_refcount) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_ds_buffers(); + } + if (!err) + atomic_inc(&pmc_refcount); + mutex_unlock(&pmc_reserve_mutex); + } + + return err; +} + +void x86_release_hardware(void) +{ + if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_ds_buffers(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Check if we can create event of a certain type (that no conflicting events + * are present). 
+ */ +int x86_add_exclusive(unsigned int what) +{ + int i; + + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto fail_unlock; + } + atomic_inc(&x86_pmu.lbr_exclusive[what]); + mutex_unlock(&pmc_reserve_mutex); + } + + atomic_inc(&active_events); + return 0; + +fail_unlock: + mutex_unlock(&pmc_reserve_mutex); + return -EBUSY; +} + +void x86_del_exclusive(unsigned int what) +{ + atomic_dec(&x86_pmu.lbr_exclusive[what]); + atomic_dec(&active_events); +} + +int x86_setup_perfctr(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + u64 config; + + if (!is_sampling_event(event)) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + if (attr->type == PERF_TYPE_RAW) + return x86_pmu_extra_regs(event->attr.config, event); + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, event); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { + /* BTS is not supported by this architecture. */ + if (!x86_pmu.bts_active) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (!attr->exclude_kernel) + return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } + + hwc->config |= config; + + return 0; +} + +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = 0; + + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) + precise++; + + if (x86_pmu.pebs_prec_dist) + precise++; + } + + if (event->attr.precise_ip > precise) + return -EOPNOTSUPP; + } + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the 
priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } + + if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + event->attach_state |= PERF_ATTACH_TASK_DATA; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + event->hw.config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to + */ + if (!event->attr.exclude_user) + event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; + if (!event->attr.exclude_kernel) + event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + + if (event->attr.sample_period && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, event->attr.sample_period) > + event->attr.sample_period) + return -EINVAL; + } + + return x86_setup_perfctr(event); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __x86_pmu_event_init(struct perf_event *event) +{ + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = x86_reserve_hardware(); + if (err) + return err; + + atomic_inc(&active_events); + event->destroy = hw_perf_event_destroy; + + event->hw.idx = -1; + event->hw.last_cpu = -1; + event->hw.last_tag = ~0ULL; + + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + return x86_pmu.hw_config(event); +} + +void x86_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu_config_addr(idx), val); + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu_config_addr(idx), val); + } +} + +static void x86_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu_initialized()) + return; + + if (!cpuc->enabled) + return; + + cpuc->n_added = 0; + cpuc->enabled = 0; + barrier(); + + x86_pmu.disable_all(); +} + +void x86_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) +{ + return event->pmu == &pmu; +} + +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + int nr_gp; /* number of GP counters used */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) 
limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + int max_gp; + int saved_states; + struct event_constraint **constraints; + struct sched_state state; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, + int num, int wmin, int wmax, int gpmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->max_gp = gpmax; + sched->constraints = constraints; + + for (idx = 0; idx < num; idx++) { + if (constraints[idx]->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + /* Prefer fixed purpose counters */ + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) { + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + + goto done; + } + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. 
+ */ +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} +EXPORT_SYMBOL_GPL(perf_assign_events); + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + struct event_constraint *c; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *e; + int i, wmin, wmax, unsched = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + if (x86_pmu.start_scheduling) + x86_pmu.start_scheduling(cpuc); + + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + cpuc->event_constraint[i] = NULL; + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + cpuc->event_constraint[i] = c; + + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = cpuc->event_constraint[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + + /* slow path */ + if (i != n) { + int gpmax = x86_pmu.num_counters; + + /* + * Do not allow scheduling of more than half the available + * generic counters. + * + * This helps avoid counter starvation of sibling thread by + * ensuring at most half the counters cannot be in exclusive + * mode. There is no designated counters for the limits. Any + * N/2 counters can be used. This helps with events with + * specific counter constraints. + */ + if (is_ht_workaround_enabled() && !cpuc->is_fake && + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) + gpmax /= 2; + + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, + wmax, gpmax, assign); + } + + /* + * In case of success (unsched = 0), mark events as committed, + * so we do not put_constraint() in case new events are added + * and fail to be scheduled + * + * We invoke the lower level commit callback to lock the resource + * + * We do not need to do all of this in case we are called to + * validate an event group (assign == NULL) + */ + if (!unsched && assign) { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + e->hw.flags |= PERF_X86_EVENT_COMMITTED; + if (x86_pmu.commit_scheduling) + x86_pmu.commit_scheduling(cpuc, i, assign[i]); + } + } else { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + /* + * do not put_constraint() on comitted events, + * because they are good to go + */ + if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) + continue; + + /* + * release events that failed scheduling + */ + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, e); + } + } + + if (x86_pmu.stop_scheduling) + x86_pmu.stop_scheduling(cpuc); + + return unsched ? 
-EINVAL : 0; +} + +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; + + /* current number of events already accepted */ + n = cpuc->n_events; + + if (is_x86_event(leader)) { + if (n >= max_count) + return -EINVAL; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + cpuc->event_list[n] = event; + n++; + } + return n; +} + +static inline void x86_assign_hw_event(struct perf_event *event, + struct cpu_hw_events *cpuc, int i) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; + + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; + } else { + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); + } +} + +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; +} + +static void x86_pmu_start(struct perf_event *event, int flags); + +static void x86_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i, added = cpuc->n_added; + + if (!x86_pmu_initialized()) + return; + + if (cpuc->enabled) + return; + + if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + */ + for (i = 0; i < n_running; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) + continue; + + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. 
+ */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); + } + + /* + * step2: reprogram moved events into new counters + */ + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (!match_prev_assignment(hwc, cpuc, i)) + x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); + } + cpuc->n_added = 0; + perf_events_lapic_init(); + } + + cpuc->enabled = 1; + barrier(); + + x86_pmu.enable_all(added); +} + +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +int x86_perf_event_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0, idx = hwc->idx; + + if (idx == INTEL_PMC_IDX_FIXED_BTS) + return 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 hw_event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + if (x86_pmu.limit_period) + left = x86_pmu.limit_period(event, left); + + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + + if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || + local64_read(&hwc->prev_count) != (u64)-left) { + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); + + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base, + (u64)(-left) & x86_pmu.cntval_mask); + } + + perf_event_update_userpage(event); + + return ret; +} + +void x86_pmu_enable_event(struct perf_event *event) +{ + if (__this_cpu_read(cpu_hw_events.enabled)) + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * Add a single event to the PMU. + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + */ +static int x86_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; + + hwc = &event->hw; + + n0 = cpuc->n_events; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + /* + * If group events scheduling transaction was started, + * skip the schedulability test here, it will be performed + * at commit time (->commit_txn) as a whole. 
+ */ + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + goto done_collect; + + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + goto out; + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); + +done_collect: + /* + * Commit the collect_events() state. See x86_pmu_del() and + * x86_pmu_*_txn(). + */ + cpuc->n_events = n; + cpuc->n_added += n - n0; + cpuc->n_txn += n - n0; + + ret = 0; +out: + return ret; +} + +static void x86_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); + } + + event->hw.state = 0; + + cpuc->events[idx] = event; + __set_bit(idx, cpuc->active_mask); + __set_bit(idx, cpuc->running); + x86_pmu.enable(event); + perf_event_update_userpage(event); +} + +void perf_event_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + u64 pebs, debugctl; + struct cpu_hw_events *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + if (x86_pmu.pebs_constraints) { + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + } + if (x86_pmu.lbr_nr) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); + } + } + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); + + prev_left = per_cpu(pmc_prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +void x86_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static void x86_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events 
*cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + /* + * event is descheduled + */ + event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; + + /* + * If we're called during a txn, we don't need to do anything. + * The events never got scheduled and ->cancel_txn will truncate + * the event_list. + * + * XXX assumes any ->del() called during a TXN will only be on + * an event added during that same TXN. + */ + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + return; + + /* + * Not a TXN, therefore cleanup properly. + */ + x86_pmu_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) + break; + } + + if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ + return; + + /* If we have a newly added event; make sure to decrease n_added. */ + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + /* Delete the array entry. */ + while (++i < cpuc->n_events) { + cpuc->event_list[i-1] = cpuc->event_list[i]; + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + } + --cpuc->n_events; + + perf_event_update_userpage(event); +} + +int x86_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + int idx, handled = 0; + u64 val; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) { + /* + * Though we deactivated the counter some cpus + * might still deliver spurious interrupts still + * in flight. Catch them: + */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; + continue; + } + + event = cpuc->events[idx]; + + val = x86_perf_event_update(event); + if (val & (1ULL << (x86_pmu.cntval_bits - 1))) + continue; + + /* + * event overflow + */ + handled++; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void perf_events_lapic_init(void) +{ + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static int +perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + u64 start_clock; + u64 finish_clock; + int ret; + + /* + * All PMUs/events that share this PMI handler should make sure to + * increment active_events for their events. 
+ */ + if (!atomic_read(&active_events)) + return NMI_DONE; + + start_clock = sched_clock(); + ret = x86_pmu.handle_irq(regs); + finish_clock = sched_clock(); + + perf_sample_event_took(finish_clock - start_clock); + + return ret; +} +NOKPROBE_SYMBOL(perf_event_nmi_handler); + +struct event_constraint emptyconstraint; +struct event_constraint unconstrained; + +static int +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int i, ret = NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) + cpuc->kfree_on_online[i] = NULL; + if (x86_pmu.cpu_prepare) + ret = x86_pmu.cpu_prepare(cpu); + break; + + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + break; + + case CPU_ONLINE: + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { + kfree(cpuc->kfree_on_online[i]); + cpuc->kfree_on_online[i] = NULL; + } + break; + + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; + + default: + break; + } + + return ret; +} + +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; + + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + +} + +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; + int offset = 0; + int i, j; + + for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; + if (x86_pmu.event_map(i + offset)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + + /* + * event_map() is index based, the attrs array is organized + * by increasing event index. 
If we shift the events, then + * we need to compensate for the event_map(), otherwise + * we are looking up the wrong event in the map + */ + offset++; + } +} + +/* Merge two pointer arrays */ +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + j++; + + new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); + if (!new) + return NULL; + + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + new[j] = NULL; + + return new; +} + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + u64 config = x86_pmu.event_map(pmu_attr->id); + + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return x86_pmu.events_sysfs_show(page, config); +} + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. 
+ */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + +static int __init init_hw_perf_events(void) +{ + struct x86_pmu_quirk *quirk; + int err; + + pr_info("Performance Events: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + err = -ENOTSUPP; + } + if (err != 0) { + pr_cont("no PMU driver, software events only.\n"); + return 0; + } + + pmu_check_apic(); + + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); + + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + perf_events_lapic_init(); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); + + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, + 0, x86_pmu.num_counters, 0, 0); + + x86_pmu_format_group.attrs = x86_pmu.format_attrs; + + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + + if (x86_pmu.cpu_events) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); + if (!WARN_ON(!tmp)) + x86_pmu_events_group.attrs = tmp; + } + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + perf_cpu_notifier(x86_pmu_notifier); + + return 0; +} +early_initcall(init_hw_perf_events); + +static inline void x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event); +} + +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. + */ +static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ + + cpuc->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + + perf_pmu_disable(pmu); + __this_cpu_write(cpu_hw_events.n_txn, 0); +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. 
+ */ +static void x86_pmu_cancel_txn(struct pmu *pmu) +{ + unsigned int txn_flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + txn_flags = cpuc->txn_flags; + cpuc->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + + /* + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). + */ + __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); + perf_pmu_enable(pmu); +} + +/* + * Commit group events scheduling transaction + * Perform the group schedulability test as a whole + * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this. + */ +static int x86_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int assign[X86_PMC_IDX_MAX]; + int n, ret; + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + + n = cpuc->n_events; + + if (!x86_pmu_initialized()) + return -EAGAIN; + + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + return ret; + + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); + + cpuc->txn_flags = 0; + perf_pmu_enable(pmu); + return 0; +} +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + cpuc->is_fake = 1; + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} + +/* + * validate that we can schedule this event + */ +static int validate_event(struct perf_event *event) +{ + struct cpu_hw_events *fake_cpuc; + struct event_constraint *c; + int ret = 0; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + + c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); + + if (!c || !c->weight) + ret = -EINVAL; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(fake_cpuc, event); + + free_fake_cpuc(fake_cpuc); + + return ret; +} + +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
+ */ +static int validate_group(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret = -EINVAL, n; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + + ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); + +out: + free_fake_cpuc(fake_cpuc); + return ret; +} + +static int x86_pmu_event_init(struct perf_event *event) +{ + struct pmu *tmp; + int err; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); + if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + + if (event->group_leader != event) + err = validate_group(event); + else + err = validate_event(event); + + event->pmu = tmp; + } + if (err) { + if (event->destroy) + event->destroy(event); + } + + if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + + return err; +} + +static void refresh_pce(void *ignored) +{ + if (current->mm) + load_mm_cr4(current->mm); +} + +static void x86_pmu_event_mapped(struct perf_event *event) +{ + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static void x86_pmu_event_unmapped(struct perf_event *event) +{ + if (!current->mm) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return 0; + + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val > 2) + return -EINVAL; + + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + + if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { + /* + * Changing into or out of always available, aka + * perf-event-bypassing mode. This path is extremely slow, + * but only root can trigger it, so it's okay. 
+ */ + if (val == 2) + static_key_slow_inc(&rdpmc_always_available); + else + static_key_slow_dec(&rdpmc_always_available); + on_each_cpu(refresh_pce, NULL, 1); + } + + x86_pmu.attr_rdpmc = val; + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + &x86_pmu_format_group, + &x86_pmu_events_group, + NULL, +}; + +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); +} + +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + + .attr_groups = x86_pmu_attr_groups, + + .event_init = x86_pmu_event_init, + + .event_mapped = x86_pmu_event_mapped, + .event_unmapped = x86_pmu_event_unmapped, + + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, + .sched_task = x86_pmu_sched_task, + .task_ctx_size = sizeof(struct x86_perf_task_context), +}; + +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) +{ + struct cyc2ns_data *data; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = + !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + userpg->pmc_width = x86_pmu.cntval_bits; + + if (!sched_clock_stable()) + return; + + data = cyc2ns_read_begin(); + + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ + userpg->cap_user_time = 1; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; + + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. 
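+ * (with cap_user_time, userspace reconstructs a timestamp roughly as + * time_offset + ((tsc * time_mult) >> time_shift); time_zero additionally + * anchors that calculation to the absolute clock, hence the local_clock + * check below)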
+ */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } + + cyc2ns_read_end(data); +} + +/* + * callchain support + */ + +static int backtrace_stack(void *data, char *name) +{ + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + perf_callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack_bp, +}; + +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + if (idx > LDT_ENTRIES) + return 0; + + /* IRQs are off, so this synchronizes with smp_store_release */ + ldt = lockless_dereference(current->active_mm->context.ldt); + if (!ldt || idx > ldt->size) + return 0; + + desc = &ldt->entries[idx]; +#else + return 0; +#endif + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = raw_cpu_ptr(gdt_page.gdt) + idx; + } + + return get_desc_base(desc); +} + +#ifdef CONFIG_IA32_EMULATION + +#include + +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + /* 32-bit process in 64-bit kernel. */ + unsigned long ss_base, cs_base; + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); + pagefault_disable(); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; + + if (!access_ok(VERIFY_READ, fp, 8)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); + if (bytes != 0) + break; + + if (!valid_user_frame(fp, sizeof(frame))) + break; + + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); + } + pagefault_enable(); + return 1; +} +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; +} +#endif + +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + struct stack_frame frame; + const void __user *fp; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + /* + * We don't know what to do with VM86 stacks.. ignore them for now. 
+ */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + + fp = (void __user *)regs->bp; + + perf_callchain_store(entry, regs->ip); + + if (!current->mm) + return; + + if (perf_callchain_user32(regs, entry)) + return; + + pagefault_disable(); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = NULL; + frame.return_address = 0; + + if (!access_ok(VERIFY_READ, fp, 16)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); + if (bytes != 0) + break; + + if (!valid_user_frame(fp, sizeof(frame))) + break; + + perf_callchain_store(entry, frame.return_address); + fp = (void __user *)frame.next_frame; + } + pagefault_enable(); +} + +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) +{ + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ + +#ifdef CONFIG_X86_32 + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. + */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); +#endif + return 0; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + return perf_guest_cbs->get_guest_ip(); + + return regs->ip + code_segment_base(regs); +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } + + if (regs->flags & PERF_EFLAGS_EXACT) + misc |= PERF_RECORD_MISC_EXACT_IP; + + return misc; +} + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 58031303e304..77000d54fcd1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,8 +30,6 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o - ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o ifdef CONFIG_AMD_IOMMU diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c deleted file mode 100644 index 
7402c8182813..000000000000 --- a/arch/x86/kernel/cpu/perf_event.c +++ /dev/null @@ -1,2429 +0,0 @@ -/* - * Performance events x86 architecture code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * Copyright (C) 2009 Google, Inc., Stephane Eranian - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "perf_event.h" - -struct x86_pmu x86_pmu __read_mostly; - -DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { - .enabled = 1, -}; - -struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; - -u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -u64 __read_mostly hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -/* - * Propagate event elapsed time into the generic event. - * Can only be executed on the CPU where the event is active. - * Returns the delta events processed. - */ -u64 x86_perf_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int shift = 64 - x86_pmu.cntval_bits; - u64 prev_raw_count, new_raw_count; - int idx = hwc->idx; - s64 delta; - - if (idx == INTEL_PMC_IDX_FIXED_BTS) - return 0; - - /* - * Careful: an NMI might modify the previous event value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic event atomically: - */ -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); - local64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -/* - * Find and validate any extra registers to set up. 
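- * (extra registers are things like the Intel OFFCORE_RESPONSE MSRs; the - * value to program into them is passed in via attr->config1 and matched - * against the x86_pmu.extra_regs[] table)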
- */ -static int x86_pmu_extra_regs(u64 config, struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - struct extra_reg *er; - - reg = &event->hw.extra_reg; - - if (!x86_pmu.extra_regs) - return 0; - - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - /* Check if the extra msrs can be safely accessed*/ - if (!er->extra_msr_access) - return -ENXIO; - - reg->idx = er->idx; - reg->config = event->attr.config1; - reg->reg = er->msr; - break; - } - return 0; -} - -static atomic_t active_events; -static atomic_t pmc_refcount; -static DEFINE_MUTEX(pmc_reserve_mutex); - -#ifdef CONFIG_X86_LOCAL_APIC - -static bool reserve_pmc_hardware(void) -{ - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) - goto perfctr_fail; - } - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) - goto eventsel_fail; - } - - return true; - -eventsel_fail: - for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu_config_addr(i)); - - i = x86_pmu.num_counters; - -perfctr_fail: - for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu_event_addr(i)); - - return false; -} - -static void release_pmc_hardware(void) -{ - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu_event_addr(i)); - release_evntsel_nmi(x86_pmu_config_addr(i)); - } -} - -#else - -static bool reserve_pmc_hardware(void) { return true; } -static void release_pmc_hardware(void) {} - -#endif - -static bool check_hw_exists(void) -{ - u64 val, val_fail, val_new= ~0; - int i, reg, reg_fail, ret = 0; - int bios_fail = 0; - int reg_safe = -1; - - /* - * Check to see if the BIOS enabled any of the counters, if so - * complain and bail. - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu_config_addr(i); - ret = rdmsrl_safe(reg, &val); - if (ret) - goto msr_fail; - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { - bios_fail = 1; - val_fail = val; - reg_fail = reg; - } else { - reg_safe = i; - } - } - - if (x86_pmu.num_counters_fixed) { - reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - ret = rdmsrl_safe(reg, &val); - if (ret) - goto msr_fail; - for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (val & (0x03 << i*4)) { - bios_fail = 1; - val_fail = val; - reg_fail = reg; - } - } - } - - /* - * If all the counters are enabled, the below test will always - * fail. The tools will also become useless in this scenario. - * Just fail and disable the hardware counters. - */ - - if (reg_safe == -1) { - reg = reg_safe; - goto msr_fail; - } - - /* - * Read the current value, change it and read it back to see if it - * matches, this is needed to detect certain hardware emulators - * (qemu/kvm) that don't trap on the MSR access and always return 0s. 
- */ - reg = x86_pmu_event_addr(reg_safe); - if (rdmsrl_safe(reg, &val)) - goto msr_fail; - val ^= 0xffffUL; - ret = wrmsrl_safe(reg, val); - ret |= rdmsrl_safe(reg, &val_new); - if (ret || val != val_new) - goto msr_fail; - - /* - * We still allow the PMU driver to operate: - */ - if (bios_fail) { - pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); - pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", - reg_fail, val_fail); - } - - return true; - -msr_fail: - pr_cont("Broken PMU hardware detected, using software events only.\n"); - pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", - boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, - reg, val_new); - - return false; -} - -static void hw_perf_event_destroy(struct perf_event *event) -{ - x86_release_hardware(); - atomic_dec(&active_events); -} - -void hw_perf_lbr_event_destroy(struct perf_event *event) -{ - hw_perf_event_destroy(event); - - /* undo the lbr/bts event accounting */ - x86_del_exclusive(x86_lbr_exclusive_lbr); -} - -static inline int x86_pmu_initialized(void) -{ - return x86_pmu.handle_irq != NULL; -} - -static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - unsigned int cache_type, cache_op, cache_result; - u64 config, val; - - config = attr->config; - - cache_type = (config >> 0) & 0xff; - if (cache_type >= PERF_COUNT_HW_CACHE_MAX) - return -EINVAL; - - cache_op = (config >> 8) & 0xff; - if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) - return -EINVAL; - - cache_result = (config >> 16) & 0xff; - if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - - if (val == 0) - return -ENOENT; - - if (val == -1) - return -EINVAL; - - hwc->config |= val; - attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; - return x86_pmu_extra_regs(val, event); -} - -int x86_reserve_hardware(void) -{ - int err = 0; - - if (!atomic_inc_not_zero(&pmc_refcount)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&pmc_refcount) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - reserve_ds_buffers(); - } - if (!err) - atomic_inc(&pmc_refcount); - mutex_unlock(&pmc_reserve_mutex); - } - - return err; -} - -void x86_release_hardware(void) -{ - if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_ds_buffers(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Check if we can create event of a certain type (that no conflicting events - * are present). 
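- * (this arbitrates the mutually exclusive users tracked in - * x86_pmu.lbr_exclusive[]; the BTS path in x86_setup_perfctr() below, - * for example, claims the LBR slot)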
- */ -int x86_add_exclusive(unsigned int what) -{ - int i; - - if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { - mutex_lock(&pmc_reserve_mutex); - for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { - if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) - goto fail_unlock; - } - atomic_inc(&x86_pmu.lbr_exclusive[what]); - mutex_unlock(&pmc_reserve_mutex); - } - - atomic_inc(&active_events); - return 0; - -fail_unlock: - mutex_unlock(&pmc_reserve_mutex); - return -EBUSY; -} - -void x86_del_exclusive(unsigned int what) -{ - atomic_dec(&x86_pmu.lbr_exclusive[what]); - atomic_dec(&active_events); -} - -int x86_setup_perfctr(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - struct hw_perf_event *hwc = &event->hw; - u64 config; - - if (!is_sampling_event(event)) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - } - - if (attr->type == PERF_TYPE_RAW) - return x86_pmu_extra_regs(event->attr.config, event); - - if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, event); - - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - - /* - * The generic map: - */ - config = x86_pmu.event_map(attr->config); - - if (config == 0) - return -ENOENT; - - if (config == -1LL) - return -EINVAL; - - /* - * Branch tracing: - */ - if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !attr->freq && hwc->sample_period == 1) { - /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts_active) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (!attr->exclude_kernel) - return -EOPNOTSUPP; - - /* disallow bts if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - - hwc->config |= config; - - return 0; -} - -/* - * check that branch_sample_type is compatible with - * settings needed for precise_ip > 1 which implies - * using the LBR to capture ALL taken branches at the - * priv levels of the measurement - */ -static inline int precise_br_compat(struct perf_event *event) -{ - u64 m = event->attr.branch_sample_type; - u64 b = 0; - - /* must capture all branches */ - if (!(m & PERF_SAMPLE_BRANCH_ANY)) - return 0; - - m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_user) - b |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - b |= PERF_SAMPLE_BRANCH_KERNEL; - - /* - * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 - */ - - return m == b; -} - -int x86_pmu_hw_config(struct perf_event *event) -{ - if (event->attr.precise_ip) { - int precise = 0; - - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; - - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; - - if (x86_pmu.pebs_prec_dist) - precise++; - } - - if (event->attr.precise_ip > precise) - return -EOPNOTSUPP; - } - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; - - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; - - /* branch_sample_type is compatible */ - - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at the 
priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; - - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } - } - - if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) - event->attach_state |= PERF_ATTACH_TASK_DATA; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - event->hw.config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to - */ - if (!event->attr.exclude_user) - event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; - if (!event->attr.exclude_kernel) - event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - - if (event->attr.type == PERF_TYPE_RAW) - event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; - - if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) - return -EINVAL; - } - - return x86_setup_perfctr(event); -} - -/* - * Setup the hardware configuration for a given attr_type - */ -static int __x86_pmu_event_init(struct perf_event *event) -{ - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = x86_reserve_hardware(); - if (err) - return err; - - atomic_inc(&active_events); - event->destroy = hw_perf_event_destroy; - - event->hw.idx = -1; - event->hw.last_cpu = -1; - event->hw.last_tag = ~0ULL; - - /* mark unused */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - - return x86_pmu.hw_config(event); -} - -void x86_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(x86_pmu_config_addr(idx), val); - if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu_config_addr(idx), val); - } -} - -static void x86_pmu_disable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (!x86_pmu_initialized()) - return; - - if (!cpuc->enabled) - return; - - cpuc->n_added = 0; - cpuc->enabled = 0; - barrier(); - - x86_pmu.disable_all(); -} - -void x86_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } -} - -static struct pmu pmu; - -static inline int is_x86_event(struct perf_event *event) -{ - return event->pmu == &pmu; -} - -/* - * Event scheduler state: - * - * Assign events iterating over all events and counters, beginning - * with events with least weights first. Keep the current iterator - * state in struct sched_state. - */ -struct sched_state { - int weight; - int event; /* event index */ - int counter; /* counter index */ - int unassigned; /* number of events to be assigned left */ - int nr_gp; /* number of GP counters used */ - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -}; - -/* Total max is X86_PMC_IDX_MAX, but we are O(n!) 
limited */ -#define SCHED_STATES_MAX 2 - -struct perf_sched { - int max_weight; - int max_events; - int max_gp; - int saved_states; - struct event_constraint **constraints; - struct sched_state state; - struct sched_state saved[SCHED_STATES_MAX]; -}; - -/* - * Initialize interator that runs through all events and counters. - */ -static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, - int num, int wmin, int wmax, int gpmax) -{ - int idx; - - memset(sched, 0, sizeof(*sched)); - sched->max_events = num; - sched->max_weight = wmax; - sched->max_gp = gpmax; - sched->constraints = constraints; - - for (idx = 0; idx < num; idx++) { - if (constraints[idx]->weight == wmin) - break; - } - - sched->state.event = idx; /* start with min weight */ - sched->state.weight = wmin; - sched->state.unassigned = num; -} - -static void perf_sched_save_state(struct perf_sched *sched) -{ - if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) - return; - - sched->saved[sched->saved_states] = sched->state; - sched->saved_states++; -} - -static bool perf_sched_restore_state(struct perf_sched *sched) -{ - if (!sched->saved_states) - return false; - - sched->saved_states--; - sched->state = sched->saved[sched->saved_states]; - - /* continue with next counter: */ - clear_bit(sched->state.counter++, sched->state.used); - - return true; -} - -/* - * Select a counter for the current event to schedule. Return true on - * success. - */ -static bool __perf_sched_find_counter(struct perf_sched *sched) -{ - struct event_constraint *c; - int idx; - - if (!sched->state.unassigned) - return false; - - if (sched->state.event >= sched->max_events) - return false; - - c = sched->constraints[sched->state.event]; - /* Prefer fixed purpose counters */ - if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { - idx = INTEL_PMC_IDX_FIXED; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { - if (!__test_and_set_bit(idx, sched->state.used)) - goto done; - } - } - - /* Grab the first unused counter starting with idx */ - idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) { - if (sched->state.nr_gp++ >= sched->max_gp) - return false; - - goto done; - } - } - - return false; - -done: - sched->state.counter = idx; - - if (c->overlap) - perf_sched_save_state(sched); - - return true; -} - -static bool perf_sched_find_counter(struct perf_sched *sched) -{ - while (!__perf_sched_find_counter(sched)) { - if (!perf_sched_restore_state(sched)) - return false; - } - - return true; -} - -/* - * Go through all unassigned events and find the next one to schedule. - * Take events with the least weight first. Return true on success. - */ -static bool perf_sched_next_event(struct perf_sched *sched) -{ - struct event_constraint *c; - - if (!sched->state.unassigned || !--sched->state.unassigned) - return false; - - do { - /* next event */ - sched->state.event++; - if (sched->state.event >= sched->max_events) { - /* next weight */ - sched->state.event = 0; - sched->state.weight++; - if (sched->state.weight > sched->max_weight) - return false; - } - c = sched->constraints[sched->state.event]; - } while (c->weight != sched->state.weight); - - sched->state.counter = 0; /* start with first counter */ - - return true; -} - -/* - * Assign a counter for each event. 
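- * Returns the number of events that could not be assigned, so 0 means - * complete success.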
- */ -int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int gpmax, int *assign) -{ - struct perf_sched sched; - - perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); - - do { - if (!perf_sched_find_counter(&sched)) - break; /* failed */ - if (assign) - assign[sched.state.event] = sched.state.counter; - } while (perf_sched_next_event(&sched)); - - return sched.state.unassigned; -} -EXPORT_SYMBOL_GPL(perf_assign_events); - -int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) -{ - struct event_constraint *c; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - struct perf_event *e; - int i, wmin, wmax, unsched = 0; - struct hw_perf_event *hwc; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - - if (x86_pmu.start_scheduling) - x86_pmu.start_scheduling(cpuc); - - for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - cpuc->event_constraint[i] = NULL; - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - cpuc->event_constraint[i] = c; - - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); - } - - /* - * fastpath, try to reuse previous register - */ - for (i = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; - c = cpuc->event_constraint[i]; - - /* never assigned */ - if (hwc->idx == -1) - break; - - /* constraint still honored */ - if (!test_bit(hwc->idx, c->idxmsk)) - break; - - /* not already used */ - if (test_bit(hwc->idx, used_mask)) - break; - - __set_bit(hwc->idx, used_mask); - if (assign) - assign[i] = hwc->idx; - } - - /* slow path */ - if (i != n) { - int gpmax = x86_pmu.num_counters; - - /* - * Do not allow scheduling of more than half the available - * generic counters. - * - * This helps avoid counter starvation of sibling thread by - * ensuring at most half the counters cannot be in exclusive - * mode. There is no designated counters for the limits. Any - * N/2 counters can be used. This helps with events with - * specific counter constraints. - */ - if (is_ht_workaround_enabled() && !cpuc->is_fake && - READ_ONCE(cpuc->excl_cntrs->exclusive_present)) - gpmax /= 2; - - unsched = perf_assign_events(cpuc->event_constraint, n, wmin, - wmax, gpmax, assign); - } - - /* - * In case of success (unsched = 0), mark events as committed, - * so we do not put_constraint() in case new events are added - * and fail to be scheduled - * - * We invoke the lower level commit callback to lock the resource - * - * We do not need to do all of this in case we are called to - * validate an event group (assign == NULL) - */ - if (!unsched && assign) { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - e->hw.flags |= PERF_X86_EVENT_COMMITTED; - if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, i, assign[i]); - } - } else { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - /* - * do not put_constraint() on comitted events, - * because they are good to go - */ - if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) - continue; - - /* - * release events that failed scheduling - */ - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, e); - } - } - - if (x86_pmu.stop_scheduling) - x86_pmu.stop_scheduling(cpuc); - - return unsched ? 
-EINVAL : 0; -} - -/* - * dogrp: true if must collect siblings events (group) - * returns total number of events and error code - */ -static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) -{ - struct perf_event *event; - int n, max_count; - - max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; - - /* current number of events already accepted */ - n = cpuc->n_events; - - if (is_x86_event(leader)) { - if (n >= max_count) - return -EINVAL; - cpuc->event_list[n] = leader; - n++; - } - if (!dogrp) - return n; - - list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_x86_event(event) || - event->state <= PERF_EVENT_STATE_OFF) - continue; - - if (n >= max_count) - return -EINVAL; - - cpuc->event_list[n] = event; - n++; - } - return n; -} - -static inline void x86_assign_hw_event(struct perf_event *event, - struct cpu_hw_events *cpuc, int i) -{ - struct hw_perf_event *hwc = &event->hw; - - hwc->idx = cpuc->assign[i]; - hwc->last_cpu = smp_processor_id(); - hwc->last_tag = ++cpuc->tags[i]; - - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { - hwc->config_base = 0; - hwc->event_base = 0; - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; - } else { - hwc->config_base = x86_pmu_config_addr(hwc->idx); - hwc->event_base = x86_pmu_event_addr(hwc->idx); - hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); - } -} - -static inline int match_prev_assignment(struct hw_perf_event *hwc, - struct cpu_hw_events *cpuc, - int i) -{ - return hwc->idx == cpuc->assign[i] && - hwc->last_cpu == smp_processor_id() && - hwc->last_tag == cpuc->tags[i]; -} - -static void x86_pmu_start(struct perf_event *event, int flags); - -static void x86_pmu_enable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_event *event; - struct hw_perf_event *hwc; - int i, added = cpuc->n_added; - - if (!x86_pmu_initialized()) - return; - - if (cpuc->enabled) - return; - - if (cpuc->n_added) { - int n_running = cpuc->n_events - cpuc->n_added; - /* - * apply assignment obtained either from - * hw_perf_group_sched_in() or x86_pmu_enable() - * - * step1: save events moving to new counters - */ - for (i = 0; i < n_running; i++) { - event = cpuc->event_list[i]; - hwc = &event->hw; - - /* - * we can avoid reprogramming counter if: - * - assigned same counter as last time - * - running on same CPU as last time - * - no other event has used the counter since - */ - if (hwc->idx == -1 || - match_prev_assignment(hwc, cpuc, i)) - continue; - - /* - * Ensure we don't accidentally enable a stopped - * counter simply because we rescheduled. 
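- * (PERF_HES_ARCH serves as the marker for this; step2 below skips any - * event that still has it set)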
- */ - if (hwc->state & PERF_HES_STOPPED) - hwc->state |= PERF_HES_ARCH; - - x86_pmu_stop(event, PERF_EF_UPDATE); - } - - /* - * step2: reprogram moved events into new counters - */ - for (i = 0; i < cpuc->n_events; i++) { - event = cpuc->event_list[i]; - hwc = &event->hw; - - if (!match_prev_assignment(hwc, cpuc, i)) - x86_assign_hw_event(event, cpuc, i); - else if (i < n_running) - continue; - - if (hwc->state & PERF_HES_ARCH) - continue; - - x86_pmu_start(event, PERF_EF_RELOAD); - } - cpuc->n_added = 0; - perf_events_lapic_init(); - } - - cpuc->enabled = 1; - barrier(); - - x86_pmu.enable_all(added); -} - -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); - -/* - * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the event disabled in hw: - */ -int x86_perf_event_set_period(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 left = local64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int ret = 0, idx = hwc->idx; - - if (idx == INTEL_PMC_IDX_FIXED_BTS) - return 0; - - /* - * If we are way outside a reasonable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - /* - * Quirk: certain CPUs dont like it if just 1 hw_event is left: - */ - if (unlikely(left < 2)) - left = 2; - - if (left > x86_pmu.max_period) - left = x86_pmu.max_period; - - if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); - - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - - if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || - local64_read(&hwc->prev_count) != (u64)-left) { - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); - - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); - } - - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - - perf_event_update_userpage(event); - - return ret; -} - -void x86_pmu_enable_event(struct perf_event *event) -{ - if (__this_cpu_read(cpu_hw_events.enabled)) - __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); -} - -/* - * Add a single event to the PMU. - * - * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. - */ -static int x86_pmu_add(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc; - int assign[X86_PMC_IDX_MAX]; - int n, n0, ret; - - hwc = &event->hw; - - n0 = cpuc->n_events; - ret = n = collect_events(cpuc, event, false); - if (ret < 0) - goto out; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - /* - * If group events scheduling transaction was started, - * skip the schedulability test here, it will be performed - * at commit time (->commit_txn) as a whole. 
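- * (the events are still collected into cpuc->event_list at done_collect, - * and n_txn counts them so that ->cancel_txn() can truncate the list - * again)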
- */ - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - goto done_collect; - - ret = x86_pmu.schedule_events(cpuc, n, assign); - if (ret) - goto out; - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n*sizeof(int)); - -done_collect: - /* - * Commit the collect_events() state. See x86_pmu_del() and - * x86_pmu_*_txn(). - */ - cpuc->n_events = n; - cpuc->n_added += n - n0; - cpuc->n_txn += n - n0; - - ret = 0; -out: - return ret; -} - -static void x86_pmu_start(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx = event->hw.idx; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(idx == -1)) - return; - - if (flags & PERF_EF_RELOAD) { - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); - } - - event->hw.state = 0; - - cpuc->events[idx] = event; - __set_bit(idx, cpuc->active_mask); - __set_bit(idx, cpuc->running); - x86_pmu.enable(event); - perf_event_update_userpage(event); -} - -void perf_event_print_debug(void) -{ - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs, debugctl; - struct cpu_hw_events *cpuc; - unsigned long flags; - int cpu, idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_events, cpu); - - if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - pr_info("\n"); - pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); - pr_info("CPU#%d: status: %016llx\n", cpu, status); - pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); - pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - if (x86_pmu.pebs_constraints) { - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); - pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); - } - if (x86_pmu.lbr_nr) { - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); - } - } - pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); - rdmsrl(x86_pmu_event_addr(idx), pmc_count); - - prev_left = per_cpu(pmc_prev_left[idx], cpu); - - pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", - cpu, idx, pmc_ctrl); - pr_info("CPU#%d: gen-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - pr_info("CPU#%d: gen-PMC%d left: %016llx\n", - cpu, idx, prev_left); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - - pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - } - local_irq_restore(flags); -} - -void x86_pmu_stop(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { - x86_pmu.disable(event); - cpuc->events[hwc->idx] = NULL; - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static void x86_pmu_del(struct perf_event *event, int flags) -{ - struct cpu_hw_events 
*cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - /* - * event is descheduled - */ - event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; - - /* - * If we're called during a txn, we don't need to do anything. - * The events never got scheduled and ->cancel_txn will truncate - * the event_list. - * - * XXX assumes any ->del() called during a TXN will only be on - * an event added during that same TXN. - */ - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - return; - - /* - * Not a TXN, therefore cleanup properly. - */ - x86_pmu_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) - break; - } - - if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ - return; - - /* If we have a newly added event; make sure to decrease n_added. */ - if (i >= cpuc->n_events - cpuc->n_added) - --cpuc->n_added; - - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); - - /* Delete the array entry. */ - while (++i < cpuc->n_events) { - cpuc->event_list[i-1] = cpuc->event_list[i]; - cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; - } - --cpuc->n_events; - - perf_event_update_userpage(event); -} - -int x86_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - int idx, handled = 0; - u64 val; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * Some chipsets need to unmask the LVTPC in a particular spot - * inside the nmi handler. As a result, the unmasking was pushed - * into all the nmi handlers. - * - * This generic handler doesn't seem to have any issues where the - * unmasking occurs so it was left at the top. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; - continue; - } - - event = cpuc->events[idx]; - - val = x86_perf_event_update(event); - if (val & (1ULL << (x86_pmu.cntval_bits - 1))) - continue; - - /* - * event overflow - */ - handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (!x86_perf_event_set_period(event)) - continue; - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -void perf_events_lapic_init(void) -{ - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - /* - * Always use NMI for PMU - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); -} - -static int -perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) -{ - u64 start_clock; - u64 finish_clock; - int ret; - - /* - * All PMUs/events that share this PMI handler should make sure to - * increment active_events for their events. 
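- * (otherwise the atomic_read() check below returns NMI_DONE and a PMI - * raised by such an event would be treated as not ours)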
- */ - if (!atomic_read(&active_events)) - return NMI_DONE; - - start_clock = sched_clock(); - ret = x86_pmu.handle_irq(regs); - finish_clock = sched_clock(); - - perf_sample_event_took(finish_clock - start_clock); - - return ret; -} -NOKPROBE_SYMBOL(perf_event_nmi_handler); - -struct event_constraint emptyconstraint; -struct event_constraint unconstrained; - -static int -x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int i, ret = NOTIFY_OK; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) - cpuc->kfree_on_online[i] = NULL; - if (x86_pmu.cpu_prepare) - ret = x86_pmu.cpu_prepare(cpu); - break; - - case CPU_STARTING: - if (x86_pmu.cpu_starting) - x86_pmu.cpu_starting(cpu); - break; - - case CPU_ONLINE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { - kfree(cpuc->kfree_on_online[i]); - cpuc->kfree_on_online[i] = NULL; - } - break; - - case CPU_DYING: - if (x86_pmu.cpu_dying) - x86_pmu.cpu_dying(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - if (x86_pmu.cpu_dead) - x86_pmu.cpu_dead(cpu); - break; - - default: - break; - } - - return ret; -} - -static void __init pmu_check_apic(void) -{ - if (cpu_has_apic) - return; - - x86_pmu.apic = 0; - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * events (user-space has to fall back and - * sample via a hrtimer based software event): - */ - pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; - -} - -static struct attribute_group x86_pmu_format_group = { - .name = "format", - .attrs = NULL, -}; - -/* - * Remove all undefined events (x86_pmu.event_map(id) == 0) - * out of events_attr attributes. - */ -static void __init filter_events(struct attribute **attrs) -{ - struct device_attribute *d; - struct perf_pmu_events_attr *pmu_attr; - int offset = 0; - int i, j; - - for (i = 0; attrs[i]; i++) { - d = (struct device_attribute *)attrs[i]; - pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); - /* str trumps id */ - if (pmu_attr->event_str) - continue; - if (x86_pmu.event_map(i + offset)) - continue; - - for (j = i; attrs[j]; j++) - attrs[j] = attrs[j + 1]; - - /* Check the shifted attr. */ - i--; - - /* - * event_map() is index based, the attrs array is organized - * by increasing event index. 
If we shift the events, then - * we need to compensate for the event_map(), otherwise - * we are looking up the wrong event in the map - */ - offset++; - } -} - -/* Merge two pointer arrays */ -__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) -{ - struct attribute **new; - int j, i; - - for (j = 0; a[j]; j++) - ; - for (i = 0; b[i]; i++) - j++; - j++; - - new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); - if (!new) - return NULL; - - j = 0; - for (i = 0; a[i]; i++) - new[j++] = a[i]; - for (i = 0; b[i]; i++) - new[j++] = b[i]; - new[j] = NULL; - - return new; -} - -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - struct perf_pmu_events_attr *pmu_attr = \ - container_of(attr, struct perf_pmu_events_attr, attr); - u64 config = x86_pmu.event_map(pmu_attr->id); - - /* string trumps id */ - if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); - - return x86_pmu.events_sysfs_show(page, config); -} - -EVENT_ATTR(cpu-cycles, CPU_CYCLES ); -EVENT_ATTR(instructions, INSTRUCTIONS ); -EVENT_ATTR(cache-references, CACHE_REFERENCES ); -EVENT_ATTR(cache-misses, CACHE_MISSES ); -EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); -EVENT_ATTR(branch-misses, BRANCH_MISSES ); -EVENT_ATTR(bus-cycles, BUS_CYCLES ); -EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); -EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); -EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); - -static struct attribute *empty_attrs; - -static struct attribute *events_attr[] = { - EVENT_PTR(CPU_CYCLES), - EVENT_PTR(INSTRUCTIONS), - EVENT_PTR(CACHE_REFERENCES), - EVENT_PTR(CACHE_MISSES), - EVENT_PTR(BRANCH_INSTRUCTIONS), - EVENT_PTR(BRANCH_MISSES), - EVENT_PTR(BUS_CYCLES), - EVENT_PTR(STALLED_CYCLES_FRONTEND), - EVENT_PTR(STALLED_CYCLES_BACKEND), - EVENT_PTR(REF_CPU_CYCLES), - NULL, -}; - -static struct attribute_group x86_pmu_events_group = { - .name = "events", - .attrs = events_attr, -}; - -ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) -{ - u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; - bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); - bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); - bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); - bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); - ssize_t ret; - - /* - * We have whole page size to spend and just little data - * to write, so we can safely use sprintf. 
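- * The result is a short string such as "event=0x3c" or, with modifier - * bits set, "event=0xd1,umask=0x01,inv,cmask=0x01" (illustrative values).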
- */ - ret = sprintf(page, "event=0x%02llx", event); - - if (umask) - ret += sprintf(page + ret, ",umask=0x%02llx", umask); - - if (edge) - ret += sprintf(page + ret, ",edge"); - - if (pc) - ret += sprintf(page + ret, ",pc"); - - if (any) - ret += sprintf(page + ret, ",any"); - - if (inv) - ret += sprintf(page + ret, ",inv"); - - if (cmask) - ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); - - ret += sprintf(page + ret, "\n"); - - return ret; -} - -static int __init init_hw_perf_events(void) -{ - struct x86_pmu_quirk *quirk; - int err; - - pr_info("Performance Events: "); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_pmu_init(); - break; - case X86_VENDOR_AMD: - err = amd_pmu_init(); - break; - default: - err = -ENOTSUPP; - } - if (err != 0) { - pr_cont("no PMU driver, software events only.\n"); - return 0; - } - - pmu_check_apic(); - - /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) - return 0; - - pr_cont("%s PMU driver.\n", x86_pmu.name); - - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ - - for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) - quirk->func(); - - if (!x86_pmu.intel_ctrl) - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - perf_events_lapic_init(); - register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); - - unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0, 0); - - x86_pmu_format_group.attrs = x86_pmu.format_attrs; - - if (x86_pmu.event_attrs) - x86_pmu_events_group.attrs = x86_pmu.event_attrs; - - if (!x86_pmu.events_sysfs_show) - x86_pmu_events_group.attrs = &empty_attrs; - else - filter_events(x86_pmu_events_group.attrs); - - if (x86_pmu.cpu_events) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); - if (!WARN_ON(!tmp)) - x86_pmu_events_group.attrs = tmp; - } - - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); - pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - - perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); - perf_cpu_notifier(x86_pmu_notifier); - - return 0; -} -early_initcall(init_hw_perf_events); - -static inline void x86_pmu_read(struct perf_event *event) -{ - x86_perf_event_update(event); -} - -/* - * Start group events scheduling transaction - * Set the flag to make pmu::enable() not perform the - * schedulability test, it will be performed at commit time - * - * We only support PERF_PMU_TXN_ADD transactions. Save the - * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD - * transactions. - */ -static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ - - cpuc->txn_flags = txn_flags; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; - - perf_pmu_disable(pmu); - __this_cpu_write(cpu_hw_events.n_txn, 0); -} - -/* - * Stop group events scheduling transaction - * Clear the flag and pmu::enable() will perform the - * schedulability test. 
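- * Only PERF_PMU_TXN_ADD transactions need any undo work here; other - * transaction types were already ignored by ->start_txn().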
- */ -static void x86_pmu_cancel_txn(struct pmu *pmu) -{ - unsigned int txn_flags; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ - - txn_flags = cpuc->txn_flags; - cpuc->txn_flags = 0; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; - - /* - * Truncate collected array by the number of events added in this - * transaction. See x86_pmu_add() and x86_pmu_*_txn(). - */ - __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); - __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); - perf_pmu_enable(pmu); -} - -/* - * Commit group events scheduling transaction - * Perform the group schedulability test as a whole - * Return 0 if success - * - * Does not cancel the transaction on failure; expects the caller to do this. - */ -static int x86_pmu_commit_txn(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int assign[X86_PMC_IDX_MAX]; - int n, ret; - - WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ - - if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { - cpuc->txn_flags = 0; - return 0; - } - - n = cpuc->n_events; - - if (!x86_pmu_initialized()) - return -EAGAIN; - - ret = x86_pmu.schedule_events(cpuc, n, assign); - if (ret) - return ret; - - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n*sizeof(int)); - - cpuc->txn_flags = 0; - perf_pmu_enable(pmu); - return 0; -} -/* - * a fake_cpuc is used to validate event groups. Due to - * the extra reg logic, we need to also allocate a fake - * per_core and per_cpu structure. Otherwise, group events - * using extra reg may conflict without the kernel being - * able to catch this when the last event gets added to - * the group. - */ -static void free_fake_cpuc(struct cpu_hw_events *cpuc) -{ - kfree(cpuc->shared_regs); - kfree(cpuc); -} - -static struct cpu_hw_events *allocate_fake_cpuc(void) -{ - struct cpu_hw_events *cpuc; - int cpu = raw_smp_processor_id(); - - cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); - if (!cpuc) - return ERR_PTR(-ENOMEM); - - /* only needed, if we have extra_regs */ - if (x86_pmu.extra_regs) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto error; - } - cpuc->is_fake = 1; - return cpuc; -error: - free_fake_cpuc(cpuc); - return ERR_PTR(-ENOMEM); -} - -/* - * validate that we can schedule this event - */ -static int validate_event(struct perf_event *event) -{ - struct cpu_hw_events *fake_cpuc; - struct event_constraint *c; - int ret = 0; - - fake_cpuc = allocate_fake_cpuc(); - if (IS_ERR(fake_cpuc)) - return PTR_ERR(fake_cpuc); - - c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); - - if (!c || !c->weight) - ret = -EINVAL; - - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(fake_cpuc, event); - - free_fake_cpuc(fake_cpuc); - - return ret; -} - -/* - * validate a single event group - * - * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters - * - * validation ensures the group can be loaded onto the - * PMU if it was the only group available. 
- */ -static int validate_group(struct perf_event *event) -{ - struct perf_event *leader = event->group_leader; - struct cpu_hw_events *fake_cpuc; - int ret = -EINVAL, n; - - fake_cpuc = allocate_fake_cpuc(); - if (IS_ERR(fake_cpuc)) - return PTR_ERR(fake_cpuc); - /* - * the event is not yet connected with its - * siblings therefore we must first collect - * existing siblings, then add the new event - * before we can simulate the scheduling - */ - n = collect_events(fake_cpuc, leader, true); - if (n < 0) - goto out; - - fake_cpuc->n_events = n; - n = collect_events(fake_cpuc, event, false); - if (n < 0) - goto out; - - fake_cpuc->n_events = n; - - ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); - -out: - free_fake_cpuc(fake_cpuc); - return ret; -} - -static int x86_pmu_event_init(struct perf_event *event) -{ - struct pmu *tmp; - int err; - - switch (event->attr.type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - break; - - default: - return -ENOENT; - } - - err = __x86_pmu_event_init(event); - if (!err) { - /* - * we temporarily connect event to its pmu - * such that validate_group() can classify - * it as an x86 event using is_x86_event() - */ - tmp = event->pmu; - event->pmu = &pmu; - - if (event->group_leader != event) - err = validate_group(event); - else - err = validate_event(event); - - event->pmu = tmp; - } - if (err) { - if (event->destroy) - event->destroy(event); - } - - if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; - - return err; -} - -static void refresh_pce(void *ignored) -{ - if (current->mm) - load_mm_cr4(current->mm); -} - -static void x86_pmu_event_mapped(struct perf_event *event) -{ - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return; - - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); -} - -static void x86_pmu_event_unmapped(struct perf_event *event) -{ - if (!current->mm) - return; - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return; - - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); -} - -static int x86_pmu_event_idx(struct perf_event *event) -{ - int idx = event->hw.idx; - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return 0; - - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; -} - -static ssize_t get_attr_rdpmc(struct device *cdev, - struct device_attribute *attr, - char *buf) -{ - return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); -} - -static ssize_t set_attr_rdpmc(struct device *cdev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - ssize_t ret; - - ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val > 2) - return -EINVAL; - - if (x86_pmu.attr_rdpmc_broken) - return -ENOTSUPP; - - if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { - /* - * Changing into or out of always available, aka - * perf-event-bypassing mode. This path is extremely slow, - * but only root can trigger it, so it's okay. 
- */ - if (val == 2) - static_key_slow_inc(&rdpmc_always_available); - else - static_key_slow_dec(&rdpmc_always_available); - on_each_cpu(refresh_pce, NULL, 1); - } - - x86_pmu.attr_rdpmc = val; - - return count; -} - -static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); - -static struct attribute *x86_pmu_attrs[] = { - &dev_attr_rdpmc.attr, - NULL, -}; - -static struct attribute_group x86_pmu_attr_group = { - .attrs = x86_pmu_attrs, -}; - -static const struct attribute_group *x86_pmu_attr_groups[] = { - &x86_pmu_attr_group, - &x86_pmu_format_group, - &x86_pmu_events_group, - NULL, -}; - -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (x86_pmu.sched_task) - x86_pmu.sched_task(ctx, sched_in); -} - -void perf_check_microcode(void) -{ - if (x86_pmu.check_microcode) - x86_pmu.check_microcode(); -} -EXPORT_SYMBOL_GPL(perf_check_microcode); - -static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, - - .attr_groups = x86_pmu_attr_groups, - - .event_init = x86_pmu_event_init, - - .event_mapped = x86_pmu_event_mapped, - .event_unmapped = x86_pmu_event_unmapped, - - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, - - .event_idx = x86_pmu_event_idx, - .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), -}; - -void arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, u64 now) -{ - struct cyc2ns_data *data; - - userpg->cap_user_time = 0; - userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); - userpg->pmc_width = x86_pmu.cntval_bits; - - if (!sched_clock_stable()) - return; - - data = cyc2ns_read_begin(); - - /* - * Internal timekeeping for enabled/running/stopped times - * is always in the local_clock domain. - */ - userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; - userpg->time_offset = data->cyc2ns_offset - now; - - /* - * cap_user_time_zero doesn't make sense when we're using a different - * time base for the records. 
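The cap_user_time fields being filled in here are what let the same self-monitoring task convert a raw TSC read into the kernel's local_clock() domain. A minimal sketch of that conversion follows, using the quotient/remainder form documented in the perf_event.h uapi header so the 64-bit multiply cannot overflow; the rdtsc() helper and the example scale factors are illustrative, real values come from the mmap'ed control page.

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return (uint64_t)hi << 32 | lo;
}

/*
 * Convert a TSC value to nanoseconds in the kernel's sched_clock domain:
 *
 *   time = time_offset + (cyc * time_mult) >> time_shift
 *
 * computed as quotient + remainder so the multiplication cannot overflow.
 */
static uint64_t cyc_to_ns(uint64_t cyc, uint16_t time_shift,
			  uint32_t time_mult, uint64_t time_offset)
{
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_offset + quot * time_mult + ((rem * time_mult) >> time_shift);
}

int main(void)
{
	/* example scale factors only; the real ones come from the mmap page */
	uint16_t time_shift  = 10;
	uint32_t time_mult   = 512;	/* 512/1024 = 0.5 ns/cycle, i.e. ~2 GHz */
	uint64_t time_offset = 0;

	printf("now ~= %llu ns\n",
	       (unsigned long long)cyc_to_ns(rdtsc(), time_shift,
					     time_mult, time_offset));
	return 0;
}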
- */ - if (event->clock == &local_clock) { - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; - } - - cyc2ns_read_end(data); -} - -/* - * callchain support - */ - -static int backtrace_stack(void *data, char *name) -{ - return 0; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry *entry = data; - - perf_callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack_bp, -}; - -void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return; - } - - perf_callchain_store(entry, regs->ip); - - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); -} - -static inline int -valid_user_frame(const void __user *fp, unsigned long size) -{ - return (__range_not_ok(fp, size, TASK_SIZE) == 0); -} - -static unsigned long get_segment_base(unsigned int segment) -{ - struct desc_struct *desc; - int idx = segment >> 3; - - if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { -#ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; - - if (idx > LDT_ENTRIES) - return 0; - - /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->size) - return 0; - - desc = &ldt->entries[idx]; -#else - return 0; -#endif - } else { - if (idx > GDT_ENTRIES) - return 0; - - desc = raw_cpu_ptr(gdt_page.gdt) + idx; - } - - return get_desc_base(desc); -} - -#ifdef CONFIG_IA32_EMULATION - -#include - -static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - /* 32-bit process in 64-bit kernel. */ - unsigned long ss_base, cs_base; - struct stack_frame_ia32 frame; - const void __user *fp; - - if (!test_thread_flag(TIF_IA32)) - return 0; - - cs_base = get_segment_base(regs->cs); - ss_base = get_segment_base(regs->ss); - - fp = compat_ptr(ss_base + regs->bp); - pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - - if (!access_ok(VERIFY_READ, fp, 8)) - break; - - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) - break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) - break; - - if (!valid_user_frame(fp, sizeof(frame))) - break; - - perf_callchain_store(entry, cs_base + frame.return_address); - fp = compat_ptr(ss_base + frame.next_frame); - } - pagefault_enable(); - return 1; -} -#else -static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - return 0; -} -#endif - -void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - struct stack_frame frame; - const void __user *fp; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return; - } - - /* - * We don't know what to do with VM86 stacks.. ignore them for now. 
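The 64-bit walk that follows does the same job as the compat walker above, just with 8-byte frame slots and no segment bases: chase the saved frame pointers and record each return address. Here is a user-space sketch of that idea, assuming the binary is built with -fno-omit-frame-pointer so %rbp really is a frame pointer; the struct name mirrors the kernel's stack_frame but everything here is local to the example, and the upward-only check merely stands in for the kernel's valid_user_frame().

#include <stdio.h>
#include <stdint.h>

struct stack_frame {
	struct stack_frame *next_frame;	/* saved %rbp of the caller */
	uint64_t return_address;	/* pushed by the call instruction */
};

static void __attribute__((noinline)) show_callchain(void)
{
	struct stack_frame *fp;
	int depth = 0;

	asm volatile("mov %%rbp, %0" : "=r" (fp));

	while (fp && depth++ < 16) {
		printf("  frame %2d: return address %#lx\n",
		       depth, (unsigned long)fp->return_address);
		if (fp->next_frame <= fp)	/* frames must move up the stack */
			break;
		fp = fp->next_frame;
	}
}

static void __attribute__((noinline)) level2(void) { show_callchain(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}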
- */ - if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) - return; - - fp = (void __user *)regs->bp; - - perf_callchain_store(entry, regs->ip); - - if (!current->mm) - return; - - if (perf_callchain_user32(regs, entry)) - return; - - pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; - frame.next_frame = NULL; - frame.return_address = 0; - - if (!access_ok(VERIFY_READ, fp, 16)) - break; - - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); - if (bytes != 0) - break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); - if (bytes != 0) - break; - - if (!valid_user_frame(fp, sizeof(frame))) - break; - - perf_callchain_store(entry, frame.return_address); - fp = (void __user *)frame.next_frame; - } - pagefault_enable(); -} - -/* - * Deal with code segment offsets for the various execution modes: - * - * VM86 - the good olde 16 bit days, where the linear address is - * 20 bits and we use regs->ip + 0x10 * regs->cs. - * - * IA32 - Where we need to look at GDT/LDT segment descriptor tables - * to figure out what the 32bit base address is. - * - * X32 - has TIF_X32 set, but is running in x86_64 - * - * X86_64 - CS,DS,SS,ES are all zero based. - */ -static unsigned long code_segment_base(struct pt_regs *regs) -{ - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ - -#ifdef CONFIG_X86_32 - /* - * If we are in VM86 mode, add the segment offset to convert to a - * linear address. - */ - if (regs->flags & X86_VM_MASK) - return 0x10 * regs->cs; - - if (user_mode(regs) && regs->cs != __USER_CS) - return get_segment_base(regs->cs); -#else - if (user_mode(regs) && !user_64bit_mode(regs) && - regs->cs != __USER32_CS) - return get_segment_base(regs->cs); -#endif - return 0; -} - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); - - return regs->ip + code_segment_base(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - int misc = 0; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - if (regs->flags & PERF_EFLAGS_EXACT) - misc |= PERF_RECORD_MISC_EXACT_IP; - - return misc; -} - -void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) -{ - cap->version = x86_pmu.version; - cap->num_counters_gp = x86_pmu.num_counters; - cap->num_counters_fixed = x86_pmu.num_counters_fixed; - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; - cap->events_mask = (unsigned int)x86_pmu.events_maskl; - cap->events_mask_len = x86_pmu.events_mask_len; -} -EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); -- cgit From 39b0332a215832ce3a8f8f57344da4a64370e3ca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:05 +0100 Subject: perf/x86: Move perf_event_amd.c ........... => x86/events/amd/core.c We distribute those in vendor subdirs, starting with .../events/amd/. 
Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 + arch/x86/events/amd/core.c | 731 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_amd.c | 731 ----------------------------------- 4 files changed, 734 insertions(+), 732 deletions(-) create mode 100644 arch/x86/events/amd/core.c delete mode 100644 arch/x86/kernel/cpu/perf_event_amd.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 3fad3ce1bbab..e0560b6dd77b 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1 +1,3 @@ obj-y += core.o + +obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c new file mode 100644 index 000000000000..51b16583679c --- /dev/null +++ b/arch/x86/events/amd/core.c @@ -0,0 +1,731 @@ +#include +#include +#include +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +static __initconst const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. 
*/ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) +{ + int offset; + + if (!index) + return index; + + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; + + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +static int amd_core_hw_config(struct perf_event *event) +{ + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD64_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD64_EVENTSEL_HOSTONLY; + + return 0; +} + +/* + * AMD64 events are detected based on their event codes. 
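Two small pieces of arithmetic above are easy to check in isolation: the 12-bit event code is split across config bits [7:0] and [35:32], and the counter MSRs are laid out with stride 1 from the K7 base on legacy parts versus stride 2 from the family 15h base when the core extensions are present. A stand-alone sketch of both computations; the MSR constants are written out literally here and are assumed to match msr-index.h.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MSR_K7_EVNTSEL0		0xc0010000
#define MSR_K7_PERFCTR0		0xc0010004
#define MSR_F15H_PERF_CTL	0xc0010200
#define MSR_F15H_PERF_CTR	0xc0010201

/* 12-bit AMD event code: config bits [35:32] and [7:0] */
static unsigned int amd_event_code(uint64_t config)
{
	return ((config >> 24) & 0x0f00) | (config & 0x00ff);
}

/* per-index MSR offset: stride 1 on legacy parts, stride 2 with PerfCtrExtCore */
static unsigned int addr_offset(int index, bool core_ext)
{
	return core_ext ? index << 1 : index;
}

int main(void)
{
	/* event 0x1E0: its low byte in the 0xE0-0xFF range marks it as an NB event */
	uint64_t config = (0x1ULL << 32) | 0xe0;

	printf("event code            : %#x\n", amd_event_code(config));
	printf("legacy   eventsel3 MSR: %#x\n",
	       MSR_K7_EVNTSEL0 + addr_offset(3, false));
	printf("core-ext eventsel3 MSR: %#x\n",
	       MSR_F15H_PERF_CTL + addr_offset(3, true));
	return 0;
}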
+ */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + if (cmpxchg(nb->owners + i, event, NULL) == event) + break; + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling __amd_put_nb_event_constraints() + * + * Non NB events are not impacted by this restriction. 
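The allocation scheme that comment describes boils down to a per-northbridge owners[] array claimed slot by slot with compare-and-swap. The stand-alone sketch below simulates that with GCC's __sync builtins; it deliberately drops the real code's refinements (preferring the counter the event used last time and migrating a claim), and all names are local to the example.

#include <stdio.h>
#include <stddef.h>

#define NUM_COUNTERS 4

/* one table per northbridge, shared by all cores on the node */
static void *nb_owners[NUM_COUNTERS];

static int nb_claim_slot(void *event)
{
	int idx;

	for (idx = 0; idx < NUM_COUNTERS; idx++) {
		/* atomically grab the slot iff nobody owns it */
		if (__sync_bool_compare_and_swap(&nb_owners[idx], NULL, event))
			return idx;
		if (nb_owners[idx] == event)
			return idx;		/* already ours, reuse */
	}
	return -1;	/* all shared counters busy: the empty-constraint case */
}

static void nb_release_slots(void *event)
{
	int idx;

	for (idx = 0; idx < NUM_COUNTERS; idx++)
		__sync_bool_compare_and_swap(&nb_owners[idx], event, NULL);
}

int main(void)
{
	int ev[5], i;	/* stand-ins for perf_event pointers */

	for (i = 0; i < 5; i++)
		printf("ev%d -> slot %d\n", i, nb_claim_slot(&ev[i]));

	nb_release_slots(&ev[1]);
	printf("ev4 retry -> slot %d\n", nb_claim_slot(&ev[4]));
	return 0;
}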
+ */ +static struct event_constraint * +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + struct event_constraint *c) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old; + int idx, new = -1; + + if (!c) + c = &unconstrained; + + if (cpuc->is_fake) + return c; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + if (new == -1 || hwc->idx == idx) + /* assign free slot, prefer hwc->idx */ + old = cmpxchg(nb->owners + idx, NULL, event); + else if (nb->owners[idx] == event) + /* event already present */ + old = event; + else + continue; + + if (old && old != event) + continue; + + /* reassign to this slot */ + if (new != -1) + cmpxchg(nb->owners + new, event, NULL); + new = idx; + + /* already present, reuse */ + if (old == event) + break; + } + + if (new == -1) + return &emptyconstraint; + + return &nb->event_constraints[new]; +} + +static struct amd_nb *amd_alloc_nb(int cpu) +{ + struct amd_nb *nb; + int i; + + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); + if (!nb) + return NULL; + + nb->nb_id = -1; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + __set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; + struct amd_nb *nb; + int i, nb_id; + + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + nb_id = amd_get_nb_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + *onln = cpuc->amd_nb; + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } +} + +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; + + return __amd_get_nb_event_constraints(cpuc, event, NULL); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, 
"config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return 
&amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* moved to perf_event_amd_uncore.c */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .addr_offset = amd_pmu_addr_offset, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = AMD64_NUM_COUNTERS, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +}; + +static int __init amd_core_pmu_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + return 0; + + switch (boot_cpu_data.x86) { + case 0x15: + pr_cont("Fam15h "); + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + break; + + default: + pr_err("core perfctr but no constraints; unknown hardware!\n"); + return -ENODEV; + } + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * amd_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + pr_cont("core perfctr, "); + return 0; +} + +__init int amd_pmu_init(void) +{ + int ret; + + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + ret = amd_core_pmu_init(); + if (ret) + return ret; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void amd_pmu_enable_virt(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. 
+ */ + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 77000d54fcd1..d549b02953ca 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_uncore.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c deleted file mode 100644 index 58610539b048..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ /dev/null @@ -1,731 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "perf_event.h" - -static __initconst const u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. 
*/ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ - [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -/* - * Previously calculated offsets - */ -static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; -static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; - -/* - * Legacy CPUs: - * 4 counters starting at 0xc0010000 each offset by 1 - * - * CPUs with core performance counter extensions: - * 6 counters starting at 0xc0010200 each offset by 2 - */ -static inline int amd_pmu_addr_offset(int index, bool eventsel) -{ - int offset; - - if (!index) - return index; - - if (eventsel) - offset = event_offsets[index]; - else - offset = count_offsets[index]; - - if (offset) - return offset; - - if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - offset = index; - else - offset = index << 1; - - if (eventsel) - event_offsets[index] = offset; - else - count_offsets[index] = offset; - - return offset; -} - -static int amd_core_hw_config(struct perf_event *event) -{ - if (event->attr.exclude_host && event->attr.exclude_guest) - /* - * When HO == GO == 1 the hardware treats that as GO == HO == 0 - * and will count in both modes. We don't want to count in that - * case so we emulate no-counting by setting US = OS = 0. - */ - event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | - ARCH_PERFMON_EVENTSEL_OS); - else if (event->attr.exclude_host) - event->hw.config |= AMD64_EVENTSEL_GUESTONLY; - else if (event->attr.exclude_guest) - event->hw.config |= AMD64_EVENTSEL_HOSTONLY; - - return 0; -} - -/* - * AMD64 events are detected based on their event codes. 
- */ -static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) -{ - return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); -} - -static inline int amd_is_nb_event(struct hw_perf_event *hwc) -{ - return (hwc->config & 0xe0) == 0xe0; -} - -static inline int amd_has_nb(struct cpu_hw_events *cpuc) -{ - struct amd_nb *nb = cpuc->amd_nb; - - return nb && nb->nb_id != -1; -} - -static int amd_pmu_hw_config(struct perf_event *event) -{ - int ret; - - /* pass precise event sampling to ibs: */ - if (event->attr.precise_ip && get_ibs_caps()) - return -ENOENT; - - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - ret = x86_pmu_hw_config(event); - if (ret) - return ret; - - if (event->attr.type == PERF_TYPE_RAW) - event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; - - return amd_core_hw_config(event); -} - -static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct amd_nb *nb = cpuc->amd_nb; - int i; - - /* - * need to scan whole list because event may not have - * been assigned during scheduling - * - * no race condition possible because event can only - * be removed on one CPU at a time AND PMU is disabled - * when we come here - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - if (cmpxchg(nb->owners + i, event, NULL) == event) - break; - } -} - - /* - * AMD64 NorthBridge events need special treatment because - * counter access needs to be synchronized across all cores - * of a package. Refer to BKDG section 3.12 - * - * NB events are events measuring L3 cache, Hypertransport - * traffic. They are identified by an event code >= 0xe00. - * They measure events on the NorthBride which is shared - * by all cores on a package. NB events are counted on a - * shared set of counters. When a NB event is programmed - * in a counter, the data actually comes from a shared - * counter. Thus, access to those counters needs to be - * synchronized. - * - * We implement the synchronization such that no two cores - * can be measuring NB events using the same counters. Thus, - * we maintain a per-NB allocation table. The available slot - * is propagated using the event_constraint structure. - * - * We provide only one choice for each NB event based on - * the fact that only NB events have restrictions. Consequently, - * if a counter is available, there is a guarantee the NB event - * will be assigned to it. If no slot is available, an empty - * constraint is returned and scheduling will eventually fail - * for this event. - * - * Note that all cores attached the same NB compete for the same - * counters to host NB events, this is why we use atomic ops. Some - * multi-chip CPUs may have more than one NB. - * - * Given that resources are allocated (cmpxchg), they must be - * eventually freed for others to use. This is accomplished by - * calling __amd_put_nb_event_constraints() - * - * Non NB events are not impacted by this restriction. 
- */ -static struct event_constraint * -__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - struct event_constraint *c) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old; - int idx, new = -1; - - if (!c) - c = &unconstrained; - - if (cpuc->is_fake) - return c; - - /* - * detect if already present, if so reuse - * - * cannot merge with actual allocation - * because of possible holes - * - * event can already be present yet not assigned (in hwc->idx) - * because of successive calls to x86_schedule_events() from - * hw_perf_group_sched_in() without hw_perf_enable() - */ - for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { - if (new == -1 || hwc->idx == idx) - /* assign free slot, prefer hwc->idx */ - old = cmpxchg(nb->owners + idx, NULL, event); - else if (nb->owners[idx] == event) - /* event already present */ - old = event; - else - continue; - - if (old && old != event) - continue; - - /* reassign to this slot */ - if (new != -1) - cmpxchg(nb->owners + new, event, NULL); - new = idx; - - /* already present, reuse */ - if (old == event) - break; - } - - if (new == -1) - return &emptyconstraint; - - return &nb->event_constraints[new]; -} - -static struct amd_nb *amd_alloc_nb(int cpu) -{ - struct amd_nb *nb; - int i; - - nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); - if (!nb) - return NULL; - - nb->nb_id = -1; - - /* - * initialize all possible NB constraints - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - __set_bit(i, nb->event_constraints[i].idxmsk); - nb->event_constraints[i].weight = 1; - } - return nb; -} - -static int amd_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - - WARN_ON_ONCE(cpuc->amd_nb); - - if (boot_cpu_data.x86_max_cores < 2) - return NOTIFY_OK; - - cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return NOTIFY_BAD; - - return NOTIFY_OK; -} - -static void amd_pmu_cpu_starting(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; - struct amd_nb *nb; - int i, nb_id; - - cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - nb_id = amd_get_nb_id(cpu); - WARN_ON_ONCE(nb_id == BAD_APICID); - - for_each_online_cpu(i) { - nb = per_cpu(cpu_hw_events, i).amd_nb; - if (WARN_ON_ONCE(!nb)) - continue; - - if (nb->nb_id == nb_id) { - *onln = cpuc->amd_nb; - cpuc->amd_nb = nb; - break; - } - } - - cpuc->amd_nb->nb_id = nb_id; - cpuc->amd_nb->refcnt++; -} - -static void amd_pmu_cpu_dead(int cpu) -{ - struct cpu_hw_events *cpuhw; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - cpuhw = &per_cpu(cpu_hw_events, cpu); - - if (cpuhw->amd_nb) { - struct amd_nb *nb = cpuhw->amd_nb; - - if (nb->nb_id == -1 || --nb->refcnt == 0) - kfree(nb); - - cpuhw->amd_nb = NULL; - } -} - -static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - /* - * if not NB event or no NB, then no constraints - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) - return &unconstrained; - - return __amd_get_nb_event_constraints(cpuc, event, NULL); -} - -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) - __amd_put_nb_event_constraints(cpuc, event); -} - -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, 
"config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *amd_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -/* AMD Family 15h */ - -#define AMD_EVENT_TYPE_MASK 0x000000F0ULL - -#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL -#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL -#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL -#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL -#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL -#define AMD_EVENT_EX_LS 0x000000C0ULL -#define AMD_EVENT_DE 0x000000D0ULL -#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL - -/* - * AMD family 15h event code/PMC mappings: - * - * type = event_code & 0x0F0: - * - * 0x000 FP PERF_CTL[5:3] - * 0x010 FP PERF_CTL[5:3] - * 0x020 LS PERF_CTL[5:0] - * 0x030 LS PERF_CTL[5:0] - * 0x040 DC PERF_CTL[5:0] - * 0x050 DC PERF_CTL[5:0] - * 0x060 CU PERF_CTL[2:0] - * 0x070 CU PERF_CTL[2:0] - * 0x080 IC/DE PERF_CTL[2:0] - * 0x090 IC/DE PERF_CTL[2:0] - * 0x0A0 --- - * 0x0B0 --- - * 0x0C0 EX/LS PERF_CTL[5:0] - * 0x0D0 DE PERF_CTL[2:0] - * 0x0E0 NB NB_PERF_CTL[3:0] - * 0x0F0 NB NB_PERF_CTL[3:0] - * - * Exceptions: - * - * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) - * 0x003 FP PERF_CTL[3] - * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) - * 0x00B FP PERF_CTL[3] - * 0x00D FP PERF_CTL[3] - * 0x023 DE PERF_CTL[2:0] - * 0x02D LS PERF_CTL[3] - * 0x02E LS PERF_CTL[3,0] - * 0x031 LS PERF_CTL[2:0] (**) - * 0x043 CU PERF_CTL[2:0] - * 0x045 CU PERF_CTL[2:0] - * 0x046 CU PERF_CTL[2:0] - * 0x054 CU PERF_CTL[2:0] - * 0x055 CU PERF_CTL[2:0] - * 0x08F IC PERF_CTL[0] - * 0x187 DE PERF_CTL[0] - * 0x188 DE PERF_CTL[0] - * 0x0DB EX PERF_CTL[5:0] - * 0x0DC LS PERF_CTL[5:0] - * 0x0DD LS PERF_CTL[5:0] - * 0x0DE LS PERF_CTL[5:0] - * 0x0DF LS PERF_CTL[5:0] - * 0x1C0 EX PERF_CTL[5:3] - * 0x1D6 EX PERF_CTL[5:0] - * 0x1D8 EX PERF_CTL[5:0] - * - * (*) depending on the umask all FPU counters may be used - * (**) only one unitmask enabled at a time - */ - -static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); -static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); -static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); -static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); -static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); - -static struct event_constraint * -amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - unsigned int event_code = amd_get_event_code(hwc); - - switch (event_code & AMD_EVENT_TYPE_MASK) { - case AMD_EVENT_FP: - switch (event_code) { - case 0x000: - if (!(hwc->config & 0x0000F000ULL)) - break; - if (!(hwc->config & 0x00000F00ULL)) - break; - return &amd_f15_PMC3; - case 0x004: - if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) - break; - return &amd_f15_PMC3; - case 0x003: - case 0x00B: - case 0x00D: - return &amd_f15_PMC3; - } - return &amd_f15_PMC53; - case AMD_EVENT_LS: - case AMD_EVENT_DC: - case AMD_EVENT_EX_LS: - switch (event_code) { - case 0x023: - case 0x043: - case 0x045: - case 0x046: - case 0x054: - case 0x055: - return &amd_f15_PMC20; - case 0x02D: - return &amd_f15_PMC3; - case 0x02E: - return 
&amd_f15_PMC30; - case 0x031: - if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) - return &amd_f15_PMC20; - return &emptyconstraint; - case 0x1C0: - return &amd_f15_PMC53; - default: - return &amd_f15_PMC50; - } - case AMD_EVENT_CU: - case AMD_EVENT_IC_DE: - case AMD_EVENT_DE: - switch (event_code) { - case 0x08F: - case 0x187: - case 0x188: - return &amd_f15_PMC0; - case 0x0DB ... 0x0DF: - case 0x1D6: - case 0x1D8: - return &amd_f15_PMC50; - default: - return &amd_f15_PMC20; - } - case AMD_EVENT_NB: - /* moved to perf_event_amd_uncore.c */ - return &emptyconstraint; - default: - return &emptyconstraint; - } -} - -static ssize_t amd_event_sysfs_show(char *page, u64 config) -{ - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | - (config & AMD64_EVENTSEL_EVENT) >> 24; - - return x86_event_sysfs_show(page, config, event); -} - -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .addr_offset = amd_pmu_addr_offset, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - .events_sysfs_show = amd_event_sysfs_show, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - -static int __init amd_core_pmu_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - return 0; - - switch (boot_cpu_data.x86) { - case 0x15: - pr_cont("Fam15h "); - x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; - break; - - default: - pr_err("core perfctr but no constraints; unknown hardware!\n"); - return -ENODEV; - } - - /* - * If core performance counter extensions exists, we must use - * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also - * amd_pmu_addr_offset(). - */ - x86_pmu.eventsel = MSR_F15H_PERF_CTL; - x86_pmu.perfctr = MSR_F15H_PERF_CTR; - x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; - - pr_cont("core perfctr, "); - return 0; -} - -__init int amd_pmu_init(void) -{ - int ret; - - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - ret = amd_core_pmu_init(); - if (ret) - return ret; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} - -void amd_pmu_enable_virt(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - cpuc->perf_ctr_virt_mask = 0; - - /* Reload all events */ - x86_pmu_disable_all(); - x86_pmu_enable_all(0); -} -EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); - -void amd_pmu_disable_virt(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * We only mask out the Host-only bit so that host-only counting works - * when SVM is disabled. If someone sets up a guest-only counter when - * SVM is disabled the Guest-only bits still gets set and the counter - * will not count anything. 
- */ - cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - - /* Reload all events */ - x86_pmu_disable_all(); - x86_pmu_enable_all(0); -} -EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); -- cgit From 218cfe4ed8885f988d67ac5f52efeff7233ae1f2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:06 +0100 Subject: perf/x86: Move perf_event_amd_ibs.c ....... => x86/events/amd/ibs.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 1 + arch/x86/events/amd/ibs.c | 959 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 959 ------------------------------- 4 files changed, 961 insertions(+), 960 deletions(-) create mode 100644 arch/x86/events/amd/ibs.c delete mode 100644 arch/x86/kernel/cpu/perf_event_amd_ibs.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index e0560b6dd77b..88f787350bf0 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,3 +1,4 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o +obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c new file mode 100644 index 000000000000..a8abd082d932 --- /dev/null +++ b/arch/x86/events/amd/ibs.c @@ -0,0 +1,959 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include + +#include + +#include "../../kernel/cpu/perf_event.h" + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +#include +#include + +#include + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + + struct attribute **format_attrs; + struct attribute_group format_group; + const struct attribute_group *attr_groups[2]; + + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. 
+ * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. 
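The shift pair in perf_event_try_update() above is worth checking in isolation: shifting both raw reads up by (64 - width) discards any bits the hardware may or may not sign-extend above the counter width, the subtraction then wraps correctly modulo 2^width, and shifting back down yields the delta. A tiny stand-alone check of that arithmetic follows; width 48 matches the AMD core counters earlier in this series, while the IBS update path calls it with width 64.

#include <stdio.h>
#include <stdint.h>

static uint64_t counter_delta(uint64_t prev_raw, uint64_t new_raw, int width)
{
	int shift = 64 - width;
	uint64_t delta;

	delta = (new_raw << shift) - (prev_raw << shift);
	delta >>= shift;

	return delta;
}

int main(void)
{
	/* a 48-bit counter that wrapped from near its maximum to a small value */
	uint64_t prev_raw = 0xfffffffffff0ULL;	/* 2^48 - 16 */
	uint64_t new_raw  = 0x20ULL;

	/* expected: 16 counts up to the wrap point plus 32 after it = 48 */
	printf("delta = %llu\n",
	       (unsigned long long)counter_delta(prev_raw, new_raw, 48));
	return 0;
}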
+ * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + +static const struct perf_event_attr ibs_notsupp = { + .exclude_user = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .exclude_idle = 1, + .exclude_host = 1, + .exclude_guest = 1, +}; + +static int perf_ibs_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) + return -ENOENT; + + if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) + return -EINVAL; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + + return 0; +} + +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. 
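The period handling in perf_ibs_init() and perf_ibs_set_period() above comes down to the MaxCnt field ignoring the low 4 bits: the requested period is rounded down, period >> 4 is what gets programmed, and get_ibs_op_count() multiplies back by 16 when the valid bit shows a rollover (the extra IbsOpCurCnt term on newer parts is left out here). A small sketch of that round trip; the mask and bit position are written out literally and only loosely modelled on IBS_OP_MAX_CNT / IBS_OP_VAL.

#include <stdio.h>
#include <stdint.h>

#define OP_MAX_CNT_MASK	0x0000ffffULL	/* MaxCnt field (illustrative) */
#define OP_VAL_BIT	(1ULL << 18)	/* sample-valid bit (illustrative) */

/* what perf_ibs_init()/perf_ibs_set_period() do to the requested period */
static uint64_t ibs_program_period(uint64_t sample_period)
{
	sample_period &= ~0x0fULL;	/* low 4 bits are not programmable */
	if (!sample_period)
		sample_period = 0x10;	/* enforce the minimum period */

	return sample_period >> 4;	/* value written to the MaxCnt field */
}

/* what get_ibs_op_count() recovers when the counter rolled over */
static uint64_t ibs_count_from_config(uint64_t config)
{
	uint64_t count = 0;

	if (config & OP_VAL_BIT)
		count += (config & OP_MAX_CNT_MASK) << 4;

	return count;
}

int main(void)
{
	uint64_t period = 100003;	/* "sample every ~100k ops" request */
	uint64_t maxcnt = ibs_program_period(period);
	uint64_t config = OP_VAL_BIT | maxcnt;

	printf("period %llu -> MaxCnt %#llx -> counted %llu ops per sample\n",
	       (unsigned long long)period,
	       (unsigned long long)maxcnt,
	       (unsigned long long)ibs_count_from_config(config));
	return 0;
}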
+ */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. 
+ */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); +} + +static void perf_ibs_read(struct perf_event *event) { } + +PMU_FORMAT_ATTR(rand_en, "config:57"); +PMU_FORMAT_ATTR(cnt_ctl, "config:19"); + +static struct attribute *ibs_fetch_format_attrs[] = { + &format_attr_rand_en.attr, + NULL, +}; + +static struct attribute *ibs_op_format_attrs[] = { + NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ + NULL, +}; + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + .format_attrs = ibs_fetch_format_attrs, + + .get_count = get_ibs_fetch_count, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + .format_attrs = ibs_op_format_attrs, + + .get_count = get_ibs_op_count, +}; + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period; + + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incoming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 
1 : 0; + } + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + /* + * Read IbsBrTarget and IbsOpData4 separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + set_linear_ip(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.size = sizeof(u32) + ibs_data.size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); + + return 1; +} + +static int +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} +NOKPROBE_SYMBOL(perf_ibs_nmi_handler); + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + /* register attributes */ + if (perf_ibs->format_attrs[0]) { + memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); + perf_ibs->format_group.name = "format"; + perf_ibs->format_group.attrs = perf_ibs->format_attrs; + + memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); + perf_ibs->attr_groups[0] = &perf_ibs->format_group; + perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; + } + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + +static __init int perf_event_ibs_init(void) +{ + struct attribute **attr = ibs_op_format_attrs; + + if (!ibs_caps) + return -ENODEV; /* ibs not supported by the cpu */ + + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + + if (ibs_caps & IBS_CAPS_OPCNT) { + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + *attr++ = &format_attr_cnt_ctl.attr; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + + return 0; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 
*/ + +static __init int perf_event_ibs_init(void) { return 0; } + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", + value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + pr_debug("No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static void force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + pr_debug("No EILVT entry available\n"); + return; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) + goto out; + + pr_info("IBS: LVT offset %d assigned\n", offset); + + return; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return; +} + +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. 
If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + +static int +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret = -EINVAL; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; + + perf_ibs_pm_init(); + cpu_notifier_register_begin(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + smp_call_function(setup_APIC_ibs, NULL, 1); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); + + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d549b02953ca..dddba2225ccd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -54,7 +54,7 @@ obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c deleted file mode 100644 index aa12f9509cfb..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ /dev/null @@ -1,959 +0,0 @@ -/* - * Performance events - AMD IBS - * - * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include - -#include - -#include "perf_event.h" - -static u32 ibs_caps; - -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) - -#include -#include - -#include - -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - -enum ibs_states { - IBS_ENABLED = 0, - 
IBS_STARTED = 1, - IBS_STOPPING = 2, - - IBS_MAX_STATES, -}; - -struct cpu_perf_ibs { - struct perf_event *event; - unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; -}; - -struct perf_ibs { - struct pmu pmu; - unsigned int msr; - u64 config_mask; - u64 cnt_mask; - u64 enable_mask; - u64 valid_mask; - u64 max_period; - unsigned long offset_mask[1]; - int offset_max; - struct cpu_perf_ibs __percpu *pcpu; - - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - - u64 (*get_count)(u64 config); -}; - -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - -static int -perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) -{ - s64 left = local64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int overflow = 0; - - /* - * If we are way outside a reasonable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - overflow = 1; - } - - if (unlikely(left < (s64)min)) { - left += period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - overflow = 1; - } - - /* - * If the hw period that triggers the sw overflow is too short - * we might hit the irq handler. This biases the results. - * Thus we shorten the next-to-last period and set the last - * period to the max period. - */ - if (left > max) { - left -= max; - if (left > max) - left = max; - else if (left < min) - left = min; - } - - *hw_period = (u64)left; - - return overflow; -} - -static int -perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) -{ - struct hw_perf_event *hwc = &event->hw; - int shift = 64 - width; - u64 prev_raw_count; - u64 delta; - - /* - * Careful: an NMI might modify the previous event value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic event atomically: - */ - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - return 0; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); - local64_sub(delta, &hwc->period_left); - - return 1; -} - -static struct perf_ibs perf_ibs_fetch; -static struct perf_ibs perf_ibs_op; - -static struct perf_ibs *get_ibs_pmu(int type) -{ - if (perf_ibs_fetch.pmu.type == type) - return &perf_ibs_fetch; - if (perf_ibs_op.pmu.type == type) - return &perf_ibs_op; - return NULL; -} - -/* - * Use IBS for precise event sampling: - * - * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count - * perf record -a -e r076:p ... # same as -e cpu-cycles:p - * perf record -a -e r0C1:p ... # use ibs op counting micro-ops - * - * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, - * MSRC001_1033) is used to select either cycle or micro-ops counting - * mode. - * - * The rip of IBS samples has skid 0. Thus, IBS supports precise - * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. 
In rare cases the - * rip is invalid when IBS was not able to record the rip correctly. - * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. - * - */ -static int perf_ibs_precise_event(struct perf_event *event, u64 *config) -{ - switch (event->attr.precise_ip) { - case 0: - return -ENOENT; - case 1: - case 2: - break; - default: - return -EOPNOTSUPP; - } - - switch (event->attr.type) { - case PERF_TYPE_HARDWARE: - switch (event->attr.config) { - case PERF_COUNT_HW_CPU_CYCLES: - *config = 0; - return 0; - } - break; - case PERF_TYPE_RAW: - switch (event->attr.config) { - case 0x0076: - *config = 0; - return 0; - case 0x00C1: - *config = IBS_OP_CNT_CTL; - return 0; - } - break; - default: - return -ENOENT; - } - - return -EOPNOTSUPP; -} - -static const struct perf_event_attr ibs_notsupp = { - .exclude_user = 1, - .exclude_kernel = 1, - .exclude_hv = 1, - .exclude_idle = 1, - .exclude_host = 1, - .exclude_guest = 1, -}; - -static int perf_ibs_init(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs; - u64 max_cnt, config; - int ret; - - perf_ibs = get_ibs_pmu(event->attr.type); - if (perf_ibs) { - config = event->attr.config; - } else { - perf_ibs = &perf_ibs_op; - ret = perf_ibs_precise_event(event, &config); - if (ret) - return ret; - } - - if (event->pmu != &perf_ibs->pmu) - return -ENOENT; - - if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) - return -EINVAL; - - if (config & ~perf_ibs->config_mask) - return -EINVAL; - - if (hwc->sample_period) { - if (config & perf_ibs->cnt_mask) - /* raw max_cnt may not be set */ - return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; - } else { - max_cnt = config & perf_ibs->cnt_mask; - config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; - } - - if (!hwc->sample_period) - return -EINVAL; - - /* - * If we modify hwc->sample_period, we also need to update - * hwc->last_period and hwc->period_left. - */ - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - - hwc->config_base = perf_ibs->msr; - hwc->config = config; - - return 0; -} - -static int perf_ibs_set_period(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 *period) -{ - int overflow; - - /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); - local64_set(&hwc->prev_count, 0); - - return overflow; -} - -static u64 get_ibs_fetch_count(u64 config) -{ - return (config & IBS_FETCH_CNT) >> 12; -} - -static u64 get_ibs_op_count(u64 config) -{ - u64 count = 0; - - if (config & IBS_OP_VAL) - count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ - - if (ibs_caps & IBS_CAPS_RDWROPCNT) - count += (config & IBS_OP_CUR_CNT) >> 32; - - return count; -} - -static void -perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, - u64 *config) -{ - u64 count = perf_ibs->get_count(*config); - - /* - * Set width to 64 since we do not overflow on max width but - * instead on max count. In perf_ibs_set_period() we clear - * prev count manually on overflow. 
- */ - while (!perf_event_try_update(event, count, 64)) { - rdmsrl(event->hw.config_base, *config); - count = perf_ibs->get_count(*config); - } -} - -static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 config) -{ - wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); -} - -/* - * Erratum #420 Instruction-Based Sampling Engine May Generate - * Interrupt that Cannot Be Cleared: - * - * Must clear counter mask first, then clear the enable bit. See - * Revision Guide for AMD Family 10h Processors, Publication #41322. - */ -static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 config) -{ - config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); - config &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, config); -} - -/* - * We cannot restore the ibs pmu state, so we always needs to update - * the event while stopping it and then reset the state when starting - * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in - * perf_ibs_start()/perf_ibs_stop() and instead always do it. - */ -static void perf_ibs_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; - - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; - - perf_ibs_set_period(perf_ibs, hwc, &period); - set_bit(IBS_STARTED, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); - - perf_event_update_userpage(event); -} - -static void perf_ibs_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 config; - int stopping; - - stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); - - if (!stopping && (hwc->state & PERF_HES_UPTODATE)) - return; - - rdmsrl(hwc->config_base, config); - - if (stopping) { - set_bit(IBS_STOPPING, pcpu->state); - perf_ibs_disable_event(perf_ibs, hwc, config); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - if (hwc->state & PERF_HES_UPTODATE) - return; - - /* - * Clear valid bit to not count rollovers on update, rollovers - * are only updated in the irq handler. 
- */ - config &= ~perf_ibs->valid_mask; - - perf_ibs_event_update(perf_ibs, event, &config); - hwc->state |= PERF_HES_UPTODATE; -} - -static int perf_ibs_add(struct perf_event *event, int flags) -{ - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - - if (test_and_set_bit(IBS_ENABLED, pcpu->state)) - return -ENOSPC; - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - pcpu->event = event; - - if (flags & PERF_EF_START) - perf_ibs_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void perf_ibs_del(struct perf_event *event, int flags) -{ - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - - if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) - return; - - perf_ibs_stop(event, PERF_EF_UPDATE); - - pcpu->event = NULL; - - perf_event_update_userpage(event); -} - -static void perf_ibs_read(struct perf_event *event) { } - -PMU_FORMAT_ATTR(rand_en, "config:57"); -PMU_FORMAT_ATTR(cnt_ctl, "config:19"); - -static struct attribute *ibs_fetch_format_attrs[] = { - &format_attr_rand_en.attr, - NULL, -}; - -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ - NULL, -}; - -static struct perf_ibs perf_ibs_fetch = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - - .event_init = perf_ibs_init, - .add = perf_ibs_add, - .del = perf_ibs_del, - .start = perf_ibs_start, - .stop = perf_ibs_stop, - .read = perf_ibs_read, - }, - .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, - .cnt_mask = IBS_FETCH_MAX_CNT, - .enable_mask = IBS_FETCH_ENABLE, - .valid_mask = IBS_FETCH_VAL, - .max_period = IBS_FETCH_MAX_CNT << 4, - .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, - .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, - - .get_count = get_ibs_fetch_count, -}; - -static struct perf_ibs perf_ibs_op = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - - .event_init = perf_ibs_init, - .add = perf_ibs_add, - .del = perf_ibs_del, - .start = perf_ibs_start, - .stop = perf_ibs_stop, - .read = perf_ibs_read, - }, - .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, - .enable_mask = IBS_OP_ENABLE, - .valid_mask = IBS_OP_VAL, - .max_period = IBS_OP_MAX_CNT << 4, - .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, - .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, - - .get_count = get_ibs_op_count, -}; - -static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) -{ - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - struct perf_event *event = pcpu->event; - struct hw_perf_event *hwc = &event->hw; - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - struct perf_ibs_data ibs_data; - int offset, size, check_rip, offset_max, throttle = 0; - unsigned int msr; - u64 *buf, *config, period; - - if (!test_bit(IBS_STARTED, pcpu->state)) { - /* - * Catch spurious interrupts after stopping IBS: After - * disabling IBS there could be still incoming NMIs - * with samples that even have the valid bit cleared. - * Mark all this NMIs as handled. - */ - return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 
1 : 0; - } - - msr = hwc->config_base; - buf = ibs_data.regs; - rdmsrl(msr, *buf); - if (!(*buf++ & perf_ibs->valid_mask)) - return 0; - - config = &ibs_data.regs[0]; - perf_ibs_event_update(perf_ibs, event, config); - perf_sample_data_init(&data, 0, hwc->last_period); - if (!perf_ibs_set_period(perf_ibs, hwc, &period)) - goto out; /* no sw counter overflow */ - - ibs_data.caps = ibs_caps; - size = 1; - offset = 1; - check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 2; - else - offset_max = 1; - do { - rdmsrl(msr + offset, *buf++); - size++; - offset = find_next_bit(perf_ibs->offset_mask, - perf_ibs->offset_max, - offset + 1); - } while (offset < offset_max); - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - /* - * Read IbsBrTarget and IbsOpData4 separately - * depending on their availability. - * Can't add to offset_max as they are staggered - */ - if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); - size++; - } - if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); - size++; - } - } - ibs_data.size = sizeof(u64) * size; - - regs = *iregs; - if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { - regs.flags &= ~PERF_EFLAGS_EXACT; - } else { - set_linear_ip(®s, ibs_data.regs[1]); - regs.flags |= PERF_EFLAGS_EXACT; - } - - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.size = sizeof(u32) + ibs_data.size; - raw.data = ibs_data.data; - data.raw = &raw; - } - - throttle = perf_event_overflow(event, &data, ®s); -out: - if (throttle) - perf_ibs_disable_event(perf_ibs, hwc, *config); - else - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); - - perf_event_update_userpage(event); - - return 1; -} - -static int -perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) -{ - int handled = 0; - - handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); - handled += perf_ibs_handle_irq(&perf_ibs_op, regs); - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} -NOKPROBE_SYMBOL(perf_ibs_nmi_handler); - -static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) -{ - struct cpu_perf_ibs __percpu *pcpu; - int ret; - - pcpu = alloc_percpu(struct cpu_perf_ibs); - if (!pcpu) - return -ENOMEM; - - perf_ibs->pcpu = pcpu; - - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); - if (ret) { - perf_ibs->pcpu = NULL; - free_percpu(pcpu); - } - - return ret; -} - -static __init int perf_event_ibs_init(void) -{ - struct attribute **attr = ibs_op_format_attrs; - - if (!ibs_caps) - return -ENODEV; /* ibs not supported by the cpu */ - - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - - if (ibs_caps & IBS_CAPS_OPCNT) { - perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); - - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); - pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); - - return 0; -} - -#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 
*/ - -static __init int perf_event_ibs_init(void) { return 0; } - -#endif - -/* IBS - apic initialization, for perf and oprofile */ - -static __init u32 __get_ibs_caps(void) -{ - u32 caps; - unsigned int max_level; - - if (!boot_cpu_has(X86_FEATURE_IBS)) - return 0; - - /* check IBS cpuid feature flags */ - max_level = cpuid_eax(0x80000000); - if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_DEFAULT; - - caps = cpuid_eax(IBS_CPUID_FEATURES); - if (!(caps & IBS_CAPS_AVAIL)) - /* cpuid flags not valid */ - return IBS_CAPS_DEFAULT; - - return caps; -} - -u32 get_ibs_caps(void) -{ - return ibs_caps; -} - -EXPORT_SYMBOL(get_ibs_caps); - -static inline int get_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); -} - -static inline int put_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, 0, 1); -} - -/* - * Check and reserve APIC extended interrupt LVT offset for IBS if available. - */ -static inline int ibs_eilvt_valid(void) -{ - int offset; - u64 val; - int valid = 0; - - preempt_disable(); - - rdmsrl(MSR_AMD64_IBSCTL, val); - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - if (!get_eilvt(offset)) { - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - valid = 1; -out: - preempt_enable(); - - return valid; -} - -static int setup_ibs_ctl(int ibs_eilvt_off) -{ - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVT_OFFSET_VALID); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { - pci_dev_put(cpu_cfg); - pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", - value); - return -EINVAL; - } - } while (1); - - if (!nodes) { - pr_debug("No CPU node configured for IBS\n"); - return -ENODEV; - } - - return 0; -} - -/* - * This runs only on the current cpu. We try to find an LVT offset and - * setup the local APIC. For this we must disable preemption. On - * success we initialize all nodes with this offset. This updates then - * the offset in the IBS_CTL per-node msr. The per-core APIC setup of - * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that - * is using the new offset. - */ -static void force_ibs_eilvt_setup(void) -{ - int offset; - int ret; - - preempt_disable(); - /* find the next free available EILVT entry, skip offset 0 */ - for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { - if (get_eilvt(offset)) - break; - } - preempt_enable(); - - if (offset == APIC_EILVT_NR_MAX) { - pr_debug("No EILVT entry available\n"); - return; - } - - ret = setup_ibs_ctl(offset); - if (ret) - goto out; - - if (!ibs_eilvt_valid()) - goto out; - - pr_info("IBS: LVT offset %d assigned\n", offset); - - return; -out: - preempt_disable(); - put_eilvt(offset); - preempt_enable(); - return; -} - -static void ibs_eilvt_setup(void) -{ - /* - * Force LVT offset assignment for family 10h: The offsets are - * not assigned by the BIOS for this family, so the OS is - * responsible for doing it. 
If the OS assignment fails, fall - * back to BIOS settings and try to setup this. - */ - if (boot_cpu_data.x86 == 0x10) - force_ibs_eilvt_setup(); -} - -static inline int get_ibs_lvt_offset(void) -{ - u64 val; - - rdmsrl(MSR_AMD64_IBSCTL, val); - if (!(val & IBSCTL_LVT_OFFSET_VALID)) - return -EINVAL; - - return val & IBSCTL_LVT_OFFSET_MASK; -} - -static void setup_APIC_ibs(void *dummy) -{ - int offset; - - offset = get_ibs_lvt_offset(); - if (offset < 0) - goto failed; - - if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) - return; -failed: - pr_warn("perf: IBS APIC setup failed on cpu #%d\n", - smp_processor_id()); -} - -static void clear_APIC_ibs(void *dummy) -{ - int offset; - - offset = get_ibs_lvt_offset(); - if (offset >= 0) - setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); -} - -#ifdef CONFIG_PM - -static int perf_ibs_suspend(void) -{ - clear_APIC_ibs(NULL); - return 0; -} - -static void perf_ibs_resume(void) -{ - ibs_eilvt_setup(); - setup_APIC_ibs(NULL); -} - -static struct syscore_ops perf_ibs_syscore_ops = { - .resume = perf_ibs_resume, - .suspend = perf_ibs_suspend, -}; - -static void perf_ibs_pm_init(void) -{ - register_syscore_ops(&perf_ibs_syscore_ops); -} - -#else - -static inline void perf_ibs_pm_init(void) { } - -#endif - -static int -perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - setup_APIC_ibs(NULL); - break; - case CPU_DYING: - clear_APIC_ibs(NULL); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static __init int amd_ibs_init(void) -{ - u32 caps; - int ret = -EINVAL; - - caps = __get_ibs_caps(); - if (!caps) - return -ENODEV; /* ibs not supported by the cpu */ - - ibs_eilvt_setup(); - - if (!ibs_eilvt_valid()) - goto out; - - perf_ibs_pm_init(); - cpu_notifier_register_begin(); - ibs_caps = caps; - /* make ibs_caps visible to other cpus: */ - smp_mb(); - smp_call_function(setup_APIC_ibs, NULL, 1); - __perf_cpu_notifier(perf_ibs_cpu_notifier); - cpu_notifier_register_done(); - - ret = perf_event_ibs_init(); -out: - if (ret) - pr_err("Failed to setup IBS, %d\n", ret); - return ret; -} - -/* Since we need the pci subsystem to init ibs we can't do this earlier: */ -device_initcall(amd_ibs_init); -- cgit From 5b26547dd7faa84e1293baa144a0f3e74ed7d4c7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:07 +0100 Subject: perf/x86: Move perf_event_amd_iommu.[ch] .. 
=> x86/events/amd/iommu.[ch] Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 3 + arch/x86/events/amd/iommu.c | 499 +++++++++++++++++++++++++++++ arch/x86/events/amd/iommu.h | 40 +++ arch/x86/kernel/cpu/Makefile | 3 - arch/x86/kernel/cpu/perf_event_amd_iommu.c | 499 ----------------------------- arch/x86/kernel/cpu/perf_event_amd_iommu.h | 40 --- 6 files changed, 542 insertions(+), 542 deletions(-) create mode 100644 arch/x86/events/amd/iommu.c create mode 100644 arch/x86/events/amd/iommu.h delete mode 100644 arch/x86/kernel/cpu/perf_event_amd_iommu.c delete mode 100644 arch/x86/kernel/cpu/perf_event_amd_iommu.h (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 88f787350bf0..838195d90961 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -2,3 +2,6 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o +endif diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c new file mode 100644 index 000000000000..629bc700eb08 --- /dev/null +++ b/arch/x86/events/amd/iommu.c @@ -0,0 +1,499 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney + * Author: Suravee Suthikulpanit + * + * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" +#include "iommu.h" + +#define COUNTER_SHIFT 16 + +#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) +#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) + +/* iommu pmu config masks */ +#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) +#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) +#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) +#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) +#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) +#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) +#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) + +static struct perf_amd_iommu __perf_iommu; + +struct perf_amd_iommu { + struct pmu pmu; + u8 max_banks; + u8 max_counters; + u64 cntr_assign_mask; + raw_spinlock_t lock; + const struct attribute_group *attr_groups[4]; +}; + +#define format_group attr_groups[0] +#define cpumask_group attr_groups[1] +#define events_group attr_groups[2] +#define null_group attr_groups[3] + +/*--------------------------------------------- + * sysfs format attributes + *---------------------------------------------*/ +PMU_FORMAT_ATTR(csource, "config:0-7"); +PMU_FORMAT_ATTR(devid, "config:8-23"); +PMU_FORMAT_ATTR(pasid, "config:24-39"); +PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); +PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); + +static struct attribute *iommu_format_attrs[] = { + &format_attr_csource.attr, + &format_attr_devid.attr, + &format_attr_pasid.attr, + &format_attr_domid.attr, + &format_attr_devid_mask.attr, + &format_attr_pasid_mask.attr, + &format_attr_domid_mask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_format_group = { + .name = "format", + .attrs = iommu_format_attrs, +}; + +/*--------------------------------------------- + * sysfs events attributes + *---------------------------------------------*/ +struct amd_iommu_event_desc { + struct kobj_attribute attr; + const char *event; +}; + +static ssize_t _iommu_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct amd_iommu_event_desc *event = + container_of(attr, struct amd_iommu_event_desc, attr); + return sprintf(buf, "%s\n", event->event); +} + +#define AMD_IOMMU_EVENT_DESC(_name, _event) \ +{ \ + .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ + .event = _event, \ +} + +static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { + AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), + AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), + AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), + AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), + AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), + AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), + AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), + AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), + AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), + 
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), + AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), + AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + { /* end: all zeroes */ }, +}; + +/*--------------------------------------------- + * sysfs cpumask attributes + *---------------------------------------------*/ +static cpumask_t iommu_cpumask; + +static ssize_t _iommu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); +} +static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); + +static struct attribute *iommu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_cpumask_group = { + .attrs = iommu_cpumask_attrs, +}; + +/*---------------------------------------------*/ + +static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +{ + unsigned long flags; + int shift, bank, cntr, retval; + int max_banks = perf_iommu->max_banks; + int max_cntrs = perf_iommu->max_counters; + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + + for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (cntr = 0; cntr < max_cntrs; cntr++) { + shift = bank + (bank*3) + cntr; + if (perf_iommu->cntr_assign_mask & (1ULL<cntr_assign_mask |= (1ULL<lock, flags); + return retval; +} + +static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, + u8 bank, u8 cntr) +{ + unsigned long flags; + int max_banks, max_cntrs; + int shift = 0; + + max_banks = perf_iommu->max_banks; + max_cntrs = perf_iommu->max_counters; + + if ((bank > max_banks) || (cntr > max_cntrs)) + return -EINVAL; + + shift = bank + cntr + (bank*3); + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + perf_iommu->cntr_assign_mask &= ~(1ULL<lock, flags); + + return 0; +} + +static int perf_iommu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_amd_iommu *perf_iommu; + u64 config, config1; + + /* test the event attr type check for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * IOMMU counters are shared across all cores. + * Therefore, it does not support per-process mode. + * Also, it does not support event sampling mode. 
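+ *
+ * Typical usage is therefore system-wide counting, for example
+ * (illustrative command lines only, based on the sysfs event and
+ * format attributes defined above):
+ *
+ *	perf stat -a -e amd_iommu/mem_trans_total/ sleep 1
+ *	perf stat -a -e amd_iommu/csource=0x05/ sleep 1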
+ */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* IOMMU counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + perf_iommu = &__perf_iommu; + + if (event->pmu != &perf_iommu->pmu) + return -ENOENT; + + if (perf_iommu) { + config = event->attr.config; + config1 = event->attr.config1; + } else { + return -EINVAL; + } + + /* integrate with iommu base devid (0000), assume one iommu */ + perf_iommu->max_banks = + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); + perf_iommu->max_counters = + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); + if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) + return -EINVAL; + + /* update the hw_perf_event struct with the iommu config data */ + hwc->config = config; + hwc->extra_reg.config = config1; + + return 0; +} + +static void perf_iommu_enable_event(struct perf_event *ev) +{ + u8 csource = _GET_CSOURCE(ev); + u16 devid = _GET_DEVID(ev); + u64 reg = 0ULL; + + reg = csource; + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_COUNTER_SRC_REG, ®, true); + + reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DEVID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_PASID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DOMID_MATCH_REG, ®, true); +} + +static void perf_iommu_disable_event(struct perf_event *event) +{ + u64 reg = 0ULL; + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_SRC_REG, ®, true); +} + +static void perf_iommu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + pr_debug("perf: amd_iommu:perf_iommu_start\n"); + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + if (flags & PERF_EF_RELOAD) { + u64 prev_raw_count = local64_read(&hwc->prev_count); + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &prev_raw_count, true); + } + + perf_iommu_enable_event(event); + perf_event_update_userpage(event); + +} + +static void perf_iommu_read(struct perf_event *event) +{ + u64 count = 0ULL; + u64 prev_raw_count = 0ULL; + u64 delta = 0ULL; + struct hw_perf_event *hwc = &event->hw; + pr_debug("perf: amd_iommu:perf_iommu_read\n"); + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &count, false); + + /* IOMMU pc counter register is only 48 bits */ + count &= 0xFFFFFFFFFFFFULL; + + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + count) != prev_raw_count) + return; + + /* Handling 48-bit counter overflowing */ + delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); + +} + +static void perf_iommu_stop(struct perf_event *event, int flags) +{ + 
struct hw_perf_event *hwc = &event->hw; + u64 config; + + pr_debug("perf: amd_iommu:perf_iommu_stop\n"); + + if (hwc->state & PERF_HES_UPTODATE) + return; + + perf_iommu_disable_event(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + config = hwc->config; + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_iommu_add(struct perf_event *event, int flags) +{ + int retval; + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_add\n"); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* request an iommu bank/counter */ + retval = get_next_avail_iommu_bnk_cntr(perf_iommu); + if (retval != -ENOSPC) + event->hw.extra_reg.reg = (u16)retval; + else + return retval; + + if (flags & PERF_EF_START) + perf_iommu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_iommu_del(struct perf_event *event, int flags) +{ + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_del\n"); + perf_iommu_stop(event, PERF_EF_UPDATE); + + /* clear the assigned iommu bank/counter */ + clear_avail_iommu_bnk_cntr(perf_iommu, + _GET_BANK(event), + _GET_CNTR(event)); + + perf_event_update_userpage(event); +} + +static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) +{ + struct attribute **attrs; + struct attribute_group *attr_group; + int i = 0, j; + + while (amd_iommu_v2_event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) + * (i + 1) + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(attr_group + 1); + for (j = 0; j < i; j++) + attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; + + attr_group->name = "events"; + attr_group->attrs = attrs; + perf_iommu->events_group = attr_group; + + return 0; +} + +static __init void amd_iommu_pc_exit(void) +{ + if (__perf_iommu.events_group != NULL) { + kfree(__perf_iommu.events_group); + __perf_iommu.events_group = NULL; + } +} + +static __init int _init_perf_amd_iommu( + struct perf_amd_iommu *perf_iommu, char *name) +{ + int ret; + + raw_spin_lock_init(&perf_iommu->lock); + + /* Init format attributes */ + perf_iommu->format_group = &amd_iommu_format_group; + + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); + perf_iommu->cpumask_group = &amd_iommu_cpumask_group; + + /* Init events attributes */ + if (_init_events_attrs(perf_iommu) != 0) + pr_err("perf: amd_iommu: Only support raw events.\n"); + + /* Init null attributes */ + perf_iommu->null_group = NULL; + perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; + + ret = perf_pmu_register(&perf_iommu->pmu, name, -1); + if (ret) { + pr_err("perf: amd_iommu: Failed to initialized.\n"); + amd_iommu_pc_exit(); + } else { + pr_info("perf: amd_iommu: Detected. 
(%d banks, %d counters/bank)\n", + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); + } + + return ret; +} + +static struct perf_amd_iommu __perf_iommu = { + .pmu = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + }, + .max_banks = 0x00, + .max_counters = 0x00, + .cntr_assign_mask = 0ULL, + .format_group = NULL, + .cpumask_group = NULL, + .events_group = NULL, + .null_group = NULL, +}; + +static __init int amd_iommu_pc_init(void) +{ + /* Make sure the IOMMU PC resource is available */ + if (!amd_iommu_pc_supported()) + return -ENODEV; + + _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); + + return 0; +} + +device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h new file mode 100644 index 000000000000..845d173278e3 --- /dev/null +++ b/arch/x86/events/amd/iommu.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney + * Author: Suravee Suthikulpanit + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _PERF_EVENT_AMD_IOMMU_H_ +#define _PERF_EVENT_AMD_IOMMU_H_ + +/* iommu pc mmio region register indexes */ +#define IOMMU_PC_COUNTER_REG 0x00 +#define IOMMU_PC_COUNTER_SRC_REG 0x08 +#define IOMMU_PC_PASID_MATCH_REG 0x10 +#define IOMMU_PC_DOMID_MATCH_REG 0x18 +#define IOMMU_PC_DEVID_MATCH_REG 0x20 +#define IOMMU_PC_COUNTER_REPORT_REG 0x28 + +/* maximun specified bank/counters */ +#define PC_MAX_SPEC_BNKS 64 +#define PC_MAX_SPEC_CNTRS 16 + +/* iommu pc reg masks*/ +#define IOMMU_BASE_DEVID 0x0000 + +/* amd_iommu_init.c external support functions */ +extern bool amd_iommu_pc_supported(void); + +extern u8 amd_iommu_pc_get_max_banks(u16 devid); + +extern u8 amd_iommu_pc_get_max_counters(u16 devid); + +extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, + u8 fxn, u64 *value, bool is_write); + +#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index dddba2225ccd..2e15d9d63756 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,9 +32,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_uncore.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o -endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c deleted file mode 100644 index 97242a9242bd..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Steven Kinney - * Author: Suravee Suthikulpanit - * - * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include - -#include "perf_event.h" -#include "perf_event_amd_iommu.h" - -#define COUNTER_SHIFT 16 - -#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) -#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) - -/* iommu pmu config masks */ -#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) -#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) -#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) -#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) -#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) -#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) -#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) - -static struct perf_amd_iommu __perf_iommu; - -struct perf_amd_iommu { - struct pmu pmu; - u8 max_banks; - u8 max_counters; - u64 cntr_assign_mask; - raw_spinlock_t lock; - const struct attribute_group *attr_groups[4]; -}; - -#define format_group attr_groups[0] -#define cpumask_group attr_groups[1] -#define events_group attr_groups[2] -#define null_group attr_groups[3] - -/*--------------------------------------------- - * sysfs format attributes - *---------------------------------------------*/ -PMU_FORMAT_ATTR(csource, "config:0-7"); -PMU_FORMAT_ATTR(devid, "config:8-23"); -PMU_FORMAT_ATTR(pasid, "config:24-39"); -PMU_FORMAT_ATTR(domid, "config:40-55"); -PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); -PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); -PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); - -static struct attribute *iommu_format_attrs[] = { - &format_attr_csource.attr, - &format_attr_devid.attr, - &format_attr_pasid.attr, - &format_attr_domid.attr, - &format_attr_devid_mask.attr, - &format_attr_pasid_mask.attr, - &format_attr_domid_mask.attr, - NULL, -}; - -static struct attribute_group amd_iommu_format_group = { - .name = "format", - .attrs = iommu_format_attrs, -}; - -/*--------------------------------------------- - * sysfs events attributes - *---------------------------------------------*/ -struct amd_iommu_event_desc { - struct kobj_attribute attr; - const char *event; -}; - -static ssize_t _iommu_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct amd_iommu_event_desc *event = - container_of(attr, struct amd_iommu_event_desc, attr); - return sprintf(buf, "%s\n", event->event); -} - -#define AMD_IOMMU_EVENT_DESC(_name, _event) \ -{ \ - .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ - .event = _event, \ -} - -static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { - AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), - AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), - AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), - AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), - AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), - AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), - AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), - AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), - AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), - 
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), - AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), - AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), - { /* end: all zeroes */ }, -}; - -/*--------------------------------------------- - * sysfs cpumask attributes - *---------------------------------------------*/ -static cpumask_t iommu_cpumask; - -static ssize_t _iommu_cpumask_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); -} -static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); - -static struct attribute *iommu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group amd_iommu_cpumask_group = { - .attrs = iommu_cpumask_attrs, -}; - -/*---------------------------------------------*/ - -static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) -{ - unsigned long flags; - int shift, bank, cntr, retval; - int max_banks = perf_iommu->max_banks; - int max_cntrs = perf_iommu->max_counters; - - raw_spin_lock_irqsave(&perf_iommu->lock, flags); - - for (bank = 0, shift = 0; bank < max_banks; bank++) { - for (cntr = 0; cntr < max_cntrs; cntr++) { - shift = bank + (bank*3) + cntr; - if (perf_iommu->cntr_assign_mask & (1ULL<cntr_assign_mask |= (1ULL<lock, flags); - return retval; -} - -static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, - u8 bank, u8 cntr) -{ - unsigned long flags; - int max_banks, max_cntrs; - int shift = 0; - - max_banks = perf_iommu->max_banks; - max_cntrs = perf_iommu->max_counters; - - if ((bank > max_banks) || (cntr > max_cntrs)) - return -EINVAL; - - shift = bank + cntr + (bank*3); - - raw_spin_lock_irqsave(&perf_iommu->lock, flags); - perf_iommu->cntr_assign_mask &= ~(1ULL<lock, flags); - - return 0; -} - -static int perf_iommu_event_init(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_amd_iommu *perf_iommu; - u64 config, config1; - - /* test the event attr type check for PMU enumeration */ - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* - * IOMMU counters are shared across all cores. - * Therefore, it does not support per-process mode. - * Also, it does not support event sampling mode. 
- */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* IOMMU counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - - if (event->cpu < 0) - return -EINVAL; - - perf_iommu = &__perf_iommu; - - if (event->pmu != &perf_iommu->pmu) - return -ENOENT; - - if (perf_iommu) { - config = event->attr.config; - config1 = event->attr.config1; - } else { - return -EINVAL; - } - - /* integrate with iommu base devid (0000), assume one iommu */ - perf_iommu->max_banks = - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); - perf_iommu->max_counters = - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); - if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) - return -EINVAL; - - /* update the hw_perf_event struct with the iommu config data */ - hwc->config = config; - hwc->extra_reg.config = config1; - - return 0; -} - -static void perf_iommu_enable_event(struct perf_event *ev) -{ - u8 csource = _GET_CSOURCE(ev); - u16 devid = _GET_DEVID(ev); - u64 reg = 0ULL; - - reg = csource; - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_COUNTER_SRC_REG, ®, true); - - reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DEVID_MATCH_REG, ®, true); - - reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_PASID_MATCH_REG, ®, true); - - reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DOMID_MATCH_REG, ®, true); -} - -static void perf_iommu_disable_event(struct perf_event *event) -{ - u64 reg = 0ULL; - - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_SRC_REG, ®, true); -} - -static void perf_iommu_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - pr_debug("perf: amd_iommu:perf_iommu_start\n"); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; - - if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &prev_raw_count, true); - } - - perf_iommu_enable_event(event); - perf_event_update_userpage(event); - -} - -static void perf_iommu_read(struct perf_event *event) -{ - u64 count = 0ULL; - u64 prev_raw_count = 0ULL; - u64 delta = 0ULL; - struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_read\n"); - - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &count, false); - - /* IOMMU pc counter register is only 48 bits */ - count &= 0xFFFFFFFFFFFFULL; - - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - count) != prev_raw_count) - return; - - /* Handling 48-bit counter overflowing */ - delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); - -} - -static void perf_iommu_stop(struct perf_event *event, int flags) -{ - 
struct hw_perf_event *hwc = &event->hw; - u64 config; - - pr_debug("perf: amd_iommu:perf_iommu_stop\n"); - - if (hwc->state & PERF_HES_UPTODATE) - return; - - perf_iommu_disable_event(event); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - if (hwc->state & PERF_HES_UPTODATE) - return; - - config = hwc->config; - perf_iommu_read(event); - hwc->state |= PERF_HES_UPTODATE; -} - -static int perf_iommu_add(struct perf_event *event, int flags) -{ - int retval; - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - - pr_debug("perf: amd_iommu:perf_iommu_add\n"); - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - /* request an iommu bank/counter */ - retval = get_next_avail_iommu_bnk_cntr(perf_iommu); - if (retval != -ENOSPC) - event->hw.extra_reg.reg = (u16)retval; - else - return retval; - - if (flags & PERF_EF_START) - perf_iommu_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void perf_iommu_del(struct perf_event *event, int flags) -{ - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - - pr_debug("perf: amd_iommu:perf_iommu_del\n"); - perf_iommu_stop(event, PERF_EF_UPDATE); - - /* clear the assigned iommu bank/counter */ - clear_avail_iommu_bnk_cntr(perf_iommu, - _GET_BANK(event), - _GET_CNTR(event)); - - perf_event_update_userpage(event); -} - -static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) -{ - struct attribute **attrs; - struct attribute_group *attr_group; - int i = 0, j; - - while (amd_iommu_v2_event_descs[i].attr.attr.name) - i++; - - attr_group = kzalloc(sizeof(struct attribute *) - * (i + 1) + sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) - return -ENOMEM; - - attrs = (struct attribute **)(attr_group + 1); - for (j = 0; j < i; j++) - attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; - - attr_group->name = "events"; - attr_group->attrs = attrs; - perf_iommu->events_group = attr_group; - - return 0; -} - -static __init void amd_iommu_pc_exit(void) -{ - if (__perf_iommu.events_group != NULL) { - kfree(__perf_iommu.events_group); - __perf_iommu.events_group = NULL; - } -} - -static __init int _init_perf_amd_iommu( - struct perf_amd_iommu *perf_iommu, char *name) -{ - int ret; - - raw_spin_lock_init(&perf_iommu->lock); - - /* Init format attributes */ - perf_iommu->format_group = &amd_iommu_format_group; - - /* Init cpumask attributes to only core 0 */ - cpumask_set_cpu(0, &iommu_cpumask); - perf_iommu->cpumask_group = &amd_iommu_cpumask_group; - - /* Init events attributes */ - if (_init_events_attrs(perf_iommu) != 0) - pr_err("perf: amd_iommu: Only support raw events.\n"); - - /* Init null attributes */ - perf_iommu->null_group = NULL; - perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; - - ret = perf_pmu_register(&perf_iommu->pmu, name, -1); - if (ret) { - pr_err("perf: amd_iommu: Failed to initialized.\n"); - amd_iommu_pc_exit(); - } else { - pr_info("perf: amd_iommu: Detected. 
(%d banks, %d counters/bank)\n", - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); - } - - return ret; -} - -static struct perf_amd_iommu __perf_iommu = { - .pmu = { - .event_init = perf_iommu_event_init, - .add = perf_iommu_add, - .del = perf_iommu_del, - .start = perf_iommu_start, - .stop = perf_iommu_stop, - .read = perf_iommu_read, - }, - .max_banks = 0x00, - .max_counters = 0x00, - .cntr_assign_mask = 0ULL, - .format_group = NULL, - .cpumask_group = NULL, - .events_group = NULL, - .null_group = NULL, -}; - -static __init int amd_iommu_pc_init(void) -{ - /* Make sure the IOMMU PC resource is available */ - if (!amd_iommu_pc_supported()) - return -ENODEV; - - _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); - - return 0; -} - -device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h deleted file mode 100644 index 845d173278e3..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Steven Kinney - * Author: Suravee Suthikulpanit - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _PERF_EVENT_AMD_IOMMU_H_ -#define _PERF_EVENT_AMD_IOMMU_H_ - -/* iommu pc mmio region register indexes */ -#define IOMMU_PC_COUNTER_REG 0x00 -#define IOMMU_PC_COUNTER_SRC_REG 0x08 -#define IOMMU_PC_PASID_MATCH_REG 0x10 -#define IOMMU_PC_DOMID_MATCH_REG 0x18 -#define IOMMU_PC_DEVID_MATCH_REG 0x20 -#define IOMMU_PC_COUNTER_REPORT_REG 0x28 - -/* maximun specified bank/counters */ -#define PC_MAX_SPEC_BNKS 64 -#define PC_MAX_SPEC_CNTRS 16 - -/* iommu pc reg masks*/ -#define IOMMU_BASE_DEVID 0x0000 - -/* amd_iommu_init.c external support functions */ -extern bool amd_iommu_pc_supported(void); - -extern u8 amd_iommu_pc_get_max_banks(u16 devid); - -extern u8 amd_iommu_pc_get_max_counters(u16 devid); - -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, - u8 fxn, u64 *value, bool is_write); - -#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ -- cgit From d0af1c0525d561fe3ab6d7a767cdd52704da25cd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:08 +0100 Subject: perf/x86: Move perf_event_amd_uncore.c .... 
=> x86/events/amd/uncore.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/amd/uncore.c | 601 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/perf_event_amd_uncore.c | 601 ---------------------------- 4 files changed, 602 insertions(+), 603 deletions(-) create mode 100644 arch/x86/events/amd/uncore.c delete mode 100644 arch/x86/kernel/cpu/perf_event_amd_uncore.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 838195d90961..7d1ecff583b0 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,6 @@ obj-y += core.o -obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o +obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c new file mode 100644 index 000000000000..19a17363a21d --- /dev/null +++ b/arch/x86/events/amd/uncore.c @@ -0,0 +1,601 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define NUM_COUNTERS_NB 4 +#define NUM_COUNTERS_L2 4 +#define MAX_COUNTERS NUM_COUNTERS_NB + +#define RDPMC_BASE_NB 6 +#define RDPMC_BASE_L2 10 + +#define COUNTER_SHIFT 16 + +struct amd_uncore { + int id; + int refcnt; + int cpu; + int num_counters; + int rdpmc_base; + u32 msr_base; + cpumask_t *active_mask; + struct pmu *pmu; + struct perf_event *events[MAX_COUNTERS]; + struct amd_uncore *free_when_cpu_online; +}; + +static struct amd_uncore * __percpu *amd_uncore_nb; +static struct amd_uncore * __percpu *amd_uncore_l2; + +static struct pmu amd_nb_pmu; +static struct pmu amd_l2_pmu; + +static cpumask_t amd_nb_active_mask; +static cpumask_t amd_l2_active_mask; + +static bool is_nb_event(struct perf_event *event) +{ + return event->pmu->type == amd_nb_pmu.type; +} + +static bool is_l2_event(struct perf_event *event) +{ + return event->pmu->type == amd_l2_pmu.type; +} + +static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +{ + if (is_nb_event(event) && amd_uncore_nb) + return *per_cpu_ptr(amd_uncore_nb, event->cpu); + else if (is_l2_event(event) && amd_uncore_l2) + return *per_cpu_ptr(amd_uncore_l2, event->cpu); + + return NULL; +} + +static void amd_uncore_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new; + s64 delta; + + /* + * since we do not enable counter overflow interrupts, + * we do not have to worry about prev_count changing on us + */ + + prev = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); + delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); +} + +static void amd_uncore_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, 
(u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + perf_event_update_userpage(event); +} + +static void amd_uncore_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); + hwc->state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + amd_uncore_read(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int amd_uncore_add(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + /* are we already assigned? */ + if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + goto out; + + for (i = 0; i < uncore->num_counters; i++) { + if (uncore->events[i] == event) { + hwc->idx = i; + goto out; + } + } + + /* if not, take the first available counter */ + hwc->idx = -1; + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + hwc->idx = i; + break; + } + } + +out: + if (hwc->idx == -1) + return -EBUSY; + + hwc->config_base = uncore->msr_base + (2 * hwc->idx); + hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void amd_uncore_del(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + amd_uncore_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], event, NULL) == event) + break; + } + + hwc->idx = -1; +} + +static int amd_uncore_event_init(struct perf_event *event) +{ + struct amd_uncore *uncore; + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * NB and L2 counters (MSRs) are shared across all cores that share the + * same NB / L2 cache. Interrupts can be directed to a single target + * core, however, event counts generated by processes running on other + * cores cannot be masked out. So we do not support sampling and + * per-thread events. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* NB and L2 counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + /* and we do not enable counter overflow interrupts */ + hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->idx = -1; + + if (event->cpu < 0) + return -EINVAL; + + uncore = event_to_amd_uncore(event); + if (!uncore) + return -ENODEV; + + /* + * since request can come in to any of the shared cores, we will remap + * to a single common cpu. 
+ */ + event->cpu = uncore->cpu; + + return 0; +} + +static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + cpumask_t *active_mask; + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu->type == amd_nb_pmu.type) + active_mask = &amd_nb_active_mask; + else if (pmu->type == amd_l2_pmu.type) + active_mask = &amd_l2_active_mask; + else + return 0; + + return cpumap_print_to_pagebuf(true, buf, active_mask); +} +static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); + +static struct attribute *amd_uncore_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_attr_group = { + .attrs = amd_uncore_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15"); + +static struct attribute *amd_uncore_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_format_group = { + .name = "format", + .attrs = amd_uncore_format_attr, +}; + +static const struct attribute_group *amd_uncore_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_format_group, + NULL, +}; + +static struct pmu amd_nb_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_nb", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct pmu amd_l2_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_l2", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) +{ + return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, + cpu_to_node(cpu)); +} + +static int amd_uncore_cpu_up_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore_nb = NULL, *uncore_l2; + + if (amd_uncore_nb) { + uncore_nb = amd_uncore_alloc(cpu); + if (!uncore_nb) + goto fail; + uncore_nb->cpu = cpu; + uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->rdpmc_base = RDPMC_BASE_NB; + uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; + uncore_nb->active_mask = &amd_nb_active_mask; + uncore_nb->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; + } + + if (amd_uncore_l2) { + uncore_l2 = amd_uncore_alloc(cpu); + if (!uncore_l2) + goto fail; + uncore_l2->cpu = cpu; + uncore_l2->num_counters = NUM_COUNTERS_L2; + uncore_l2->rdpmc_base = RDPMC_BASE_L2; + uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_l2->active_mask = &amd_l2_active_mask; + uncore_l2->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; + } + + return 0; + +fail: + kfree(uncore_nb); + return -ENOMEM; +} + +static struct amd_uncore * +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) +{ + unsigned int cpu; + struct amd_uncore *that; + + for_each_online_cpu(cpu) { + that = *per_cpu_ptr(uncores, cpu); + + if (!that) + continue; + + if (this == that) + continue; + + if (this->id == that->id) { + that->free_when_cpu_online = this; + this = that; + break; + } + } + + this->refcnt++; + return this; +} + +static void amd_uncore_cpu_starting(unsigned int cpu) +{ + unsigned int eax, ebx, ecx, edx; + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = *per_cpu_ptr(amd_uncore_nb, cpu); + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + uncore->id = ecx & 0xff; + + uncore = 
amd_uncore_find_online_sibling(uncore, amd_uncore_nb); + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + unsigned int apicid = cpu_data(cpu).apicid; + unsigned int nshared; + + uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); + nshared = ((eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static void uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + kfree(uncore->free_when_cpu_online); + uncore->free_when_cpu_online = NULL; + + if (cpu == uncore->cpu) + cpumask_set_cpu(cpu, uncore->active_mask); +} + +static void amd_uncore_cpu_online(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_online(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_online(cpu, amd_uncore_l2); +} + +static void uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + unsigned int i; + struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); + + if (this->cpu != cpu) + return; + + /* this cpu is going down, migrate to a shared sibling if possible */ + for_each_online_cpu(i) { + struct amd_uncore *that = *per_cpu_ptr(uncores, i); + + if (cpu == i) + continue; + + if (this == that) { + perf_pmu_migrate_context(this->pmu, cpu, i); + cpumask_clear_cpu(cpu, that->active_mask); + cpumask_set_cpu(i, that->active_mask); + that->cpu = i; + break; + } + } +} + +static void amd_uncore_cpu_down_prepare(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_down_prepare(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_down_prepare(cpu, amd_uncore_l2); +} + +static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + if (cpu == uncore->cpu) + cpumask_clear_cpu(cpu, uncore->active_mask); + + if (!--uncore->refcnt) + kfree(uncore); + *per_cpu_ptr(uncores, cpu) = NULL; +} + +static void amd_uncore_cpu_dead(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_dead(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_dead(cpu, amd_uncore_l2); +} + +static int +amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (amd_uncore_cpu_up_prepare(cpu)) + return notifier_from_errno(-ENOMEM); + break; + + case CPU_STARTING: + amd_uncore_cpu_starting(cpu); + break; + + case CPU_ONLINE: + amd_uncore_cpu_online(cpu); + break; + + case CPU_DOWN_PREPARE: + amd_uncore_cpu_down_prepare(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + amd_uncore_cpu_dead(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block amd_uncore_cpu_notifier_block = { + .notifier_call = amd_uncore_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + +static void __init init_cpu_already_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_starting(cpu); + amd_uncore_cpu_online(cpu); +} + +static void cleanup_cpu_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_dead(cpu); +} + +static int __init amd_uncore_init(void) +{ + unsigned int cpu, cpu2; + int ret = -ENODEV; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + goto fail_nodev; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + goto fail_nodev; + + if 
(boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + amd_uncore_nb = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_nb) { + ret = -ENOMEM; + goto fail_nb; + } + ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (ret) + goto fail_nb; + + pr_info("perf: AMD NB counters detected\n"); + ret = 0; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + amd_uncore_l2 = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_l2) { + ret = -ENOMEM; + goto fail_l2; + } + ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + if (ret) + goto fail_l2; + + pr_info("perf: AMD L2I counters detected\n"); + ret = 0; + } + + if (ret) + goto fail_nodev; + + cpu_notifier_register_begin(); + + /* init cpus already online before registering for hotplug notifier */ + for_each_online_cpu(cpu) { + ret = amd_uncore_cpu_up_prepare(cpu); + if (ret) + goto fail_online; + smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); + } + + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); + + return 0; + + +fail_online: + for_each_online_cpu(cpu2) { + if (cpu2 == cpu) + break; + smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); + } + cpu_notifier_register_done(); + + /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ + amd_uncore_nb = amd_uncore_l2 = NULL; + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) + perf_pmu_unregister(&amd_l2_pmu); +fail_l2: + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) + perf_pmu_unregister(&amd_nb_pmu); + if (amd_uncore_l2) + free_percpu(amd_uncore_l2); +fail_nb: + if (amd_uncore_nb) + free_percpu(amd_uncore_nb); + +fail_nodev: + return ret; +} +device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 2e15d9d63756..7edbeb945be5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_uncore.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c deleted file mode 100644 index 19a17363a21d..000000000000 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ /dev/null @@ -1,601 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define NUM_COUNTERS_NB 4 -#define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB - -#define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 - -#define COUNTER_SHIFT 16 - -struct amd_uncore { - int id; - int refcnt; - int cpu; - int num_counters; - int rdpmc_base; - u32 msr_base; - cpumask_t *active_mask; - struct pmu *pmu; - struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; -}; - -static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; - -static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; - -static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; - -static bool is_nb_event(struct perf_event *event) -{ - return event->pmu->type == amd_nb_pmu.type; -} - -static bool is_l2_event(struct perf_event *event) -{ - return event->pmu->type == amd_l2_pmu.type; -} - -static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) -{ - if (is_nb_event(event) && amd_uncore_nb) - return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); - - return NULL; -} - -static void amd_uncore_read(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev, new; - s64 delta; - - /* - * since we do not enable counter overflow interrupts, - * we do not have to worry about prev_count changing on us - */ - - prev = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new); - local64_set(&hwc->prev_count, new); - delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); -} - -static void amd_uncore_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - if (flags & PERF_EF_RELOAD) - wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); - - hwc->state = 0; - wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); - perf_event_update_userpage(event); -} - -static void amd_uncore_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); - hwc->state |= PERF_HES_STOPPED; - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - amd_uncore_read(event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int amd_uncore_add(struct perf_event *event, int flags) -{ - int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); - struct hw_perf_event *hwc = &event->hw; - - /* are we already assigned? 
*/ - if (hwc->idx != -1 && uncore->events[hwc->idx] == event) - goto out; - - for (i = 0; i < uncore->num_counters; i++) { - if (uncore->events[i] == event) { - hwc->idx = i; - goto out; - } - } - - /* if not, take the first available counter */ - hwc->idx = -1; - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { - hwc->idx = i; - break; - } - } - -out: - if (hwc->idx == -1) - return -EBUSY; - - hwc->config_base = uncore->msr_base + (2 * hwc->idx); - hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); - hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (flags & PERF_EF_START) - amd_uncore_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void amd_uncore_del(struct perf_event *event, int flags) -{ - int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); - struct hw_perf_event *hwc = &event->hw; - - amd_uncore_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], event, NULL) == event) - break; - } - - hwc->idx = -1; -} - -static int amd_uncore_event_init(struct perf_event *event) -{ - struct amd_uncore *uncore; - struct hw_perf_event *hwc = &event->hw; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. - */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* NB and L2 counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - - /* and we do not enable counter overflow interrupts */ - hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; - hwc->idx = -1; - - if (event->cpu < 0) - return -EINVAL; - - uncore = event_to_amd_uncore(event); - if (!uncore) - return -ENODEV; - - /* - * since request can come in to any of the shared cores, we will remap - * to a single common cpu. 
- */ - event->cpu = uncore->cpu; - - return 0; -} - -static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - cpumask_t *active_mask; - struct pmu *pmu = dev_get_drvdata(dev); - - if (pmu->type == amd_nb_pmu.type) - active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; - else - return 0; - - return cpumap_print_to_pagebuf(true, buf, active_mask); -} -static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); - -static struct attribute *amd_uncore_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_attr_group = { - .attrs = amd_uncore_attrs, -}; - -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, -}; - -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; - -static struct pmu amd_nb_pmu = { - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, -}; - -static struct pmu amd_l2_pmu = { - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, -}; - -static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) -{ - return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, - cpu_to_node(cpu)); -} - -static int amd_uncore_cpu_up_prepare(unsigned int cpu) -{ - struct amd_uncore *uncore_nb = NULL, *uncore_l2; - - if (amd_uncore_nb) { - uncore_nb = amd_uncore_alloc(cpu); - if (!uncore_nb) - goto fail; - uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; - uncore_nb->rdpmc_base = RDPMC_BASE_NB; - uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; - uncore_nb->active_mask = &amd_nb_active_mask; - uncore_nb->pmu = &amd_nb_pmu; - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; - } - - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) - goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; - } - - return 0; - -fail: - kfree(uncore_nb); - return -ENOMEM; -} - -static struct amd_uncore * -amd_uncore_find_online_sibling(struct amd_uncore *this, - struct amd_uncore * __percpu *uncores) -{ - unsigned int cpu; - struct amd_uncore *that; - - for_each_online_cpu(cpu) { - that = *per_cpu_ptr(uncores, cpu); - - if (!that) - continue; - - if (this == that) - continue; - - if (this->id == that->id) { - that->free_when_cpu_online = this; - this = that; - break; - } - } - - this->refcnt++; - return this; -} - -static void amd_uncore_cpu_starting(unsigned int cpu) -{ - unsigned int eax, ebx, ecx, edx; - struct amd_uncore *uncore; - - if (amd_uncore_nb) { - uncore = *per_cpu_ptr(amd_uncore_nb, cpu); - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - uncore->id = ecx & 0xff; - - uncore = 
amd_uncore_find_online_sibling(uncore, amd_uncore_nb); - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; - } - - if (amd_uncore_l2) { - unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; - - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; - uncore->id = apicid - (apicid % nshared); - - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; - } -} - -static void uncore_online(unsigned int cpu, - struct amd_uncore * __percpu *uncores) -{ - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; - - if (cpu == uncore->cpu) - cpumask_set_cpu(cpu, uncore->active_mask); -} - -static void amd_uncore_cpu_online(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_online(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); -} - -static void uncore_down_prepare(unsigned int cpu, - struct amd_uncore * __percpu *uncores) -{ - unsigned int i; - struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); - - if (this->cpu != cpu) - return; - - /* this cpu is going down, migrate to a shared sibling if possible */ - for_each_online_cpu(i) { - struct amd_uncore *that = *per_cpu_ptr(uncores, i); - - if (cpu == i) - continue; - - if (this == that) { - perf_pmu_migrate_context(this->pmu, cpu, i); - cpumask_clear_cpu(cpu, that->active_mask); - cpumask_set_cpu(i, that->active_mask); - that->cpu = i; - break; - } - } -} - -static void amd_uncore_cpu_down_prepare(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_down_prepare(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); -} - -static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) -{ - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - - if (cpu == uncore->cpu) - cpumask_clear_cpu(cpu, uncore->active_mask); - - if (!--uncore->refcnt) - kfree(uncore); - *per_cpu_ptr(uncores, cpu) = NULL; -} - -static void amd_uncore_cpu_dead(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_dead(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_dead(cpu, amd_uncore_l2); -} - -static int -amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - if (amd_uncore_cpu_up_prepare(cpu)) - return notifier_from_errno(-ENOMEM); - break; - - case CPU_STARTING: - amd_uncore_cpu_starting(cpu); - break; - - case CPU_ONLINE: - amd_uncore_cpu_online(cpu); - break; - - case CPU_DOWN_PREPARE: - amd_uncore_cpu_down_prepare(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - amd_uncore_cpu_dead(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block amd_uncore_cpu_notifier_block = { - .notifier_call = amd_uncore_cpu_notifier, - .priority = CPU_PRI_PERF + 1, -}; - -static void __init init_cpu_already_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_starting(cpu); - amd_uncore_cpu_online(cpu); -} - -static void cleanup_cpu_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_dead(cpu); -} - -static int __init amd_uncore_init(void) -{ - unsigned int cpu, cpu2; - int ret = -ENODEV; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto fail_nodev; - - if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) - goto fail_nodev; - - if 
(boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - amd_uncore_nb = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_nb) { - ret = -ENOMEM; - goto fail_nb; - } - ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); - if (ret) - goto fail_nb; - - pr_info("perf: AMD NB counters detected\n"); - ret = 0; - } - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { - ret = -ENOMEM; - goto fail_l2; - } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); - if (ret) - goto fail_l2; - - pr_info("perf: AMD L2I counters detected\n"); - ret = 0; - } - - if (ret) - goto fail_nodev; - - cpu_notifier_register_begin(); - - /* init cpus already online before registering for hotplug notifier */ - for_each_online_cpu(cpu) { - ret = amd_uncore_cpu_up_prepare(cpu); - if (ret) - goto fail_online; - smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); - } - - __register_cpu_notifier(&amd_uncore_cpu_notifier_block); - cpu_notifier_register_done(); - - return 0; - - -fail_online: - for_each_online_cpu(cpu2) { - if (cpu2 == cpu) - break; - smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); - } - cpu_notifier_register_done(); - - /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ - amd_uncore_nb = amd_uncore_l2 = NULL; - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) - perf_pmu_unregister(&amd_l2_pmu); -fail_l2: - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) - perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); -fail_nb: - if (amd_uncore_nb) - free_percpu(amd_uncore_nb); - -fail_nodev: - return ret; -} -device_initcall(amd_uncore_init); -- cgit From 8dd5032d9c540111dd673078738d137a998d6c3f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 7 Feb 2016 22:51:27 +0100 Subject: x86/asm/bitops: Force inlining of test_and_set_bit and friends Sometimes GCC mysteriously doesn't inline very small functions we expect to be inlined, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 Arguably, GCC should do better, but GCC people aren't willing to invest time into it and are asking to use __always_inline instead. With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os here's an example of functions getting deinlined many times: test_and_set_bit (166 copies, ~1260 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f ab 3e lock bts %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq test_and_clear_bit (124 copies, ~1000 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq change_bit (3 copies, 8 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f bb 3e lock btc %rdi,(%rsi) 5d pop %rbp c3 retq clear_bit_unlock (2 copies, 11 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 5d pop %rbp c3 retq This patch works it around via s/inline/__always_inline/. Code size decrease by ~13.5k after the patch: text data bss dec filename 92110727 20826144 36417536 149354407 vmlinux.before 92097234 20826176 36417536 149340946 vmlinux.after Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Rientjes Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bitops.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index cfe3b954d5e4..7766d1cf096e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); } @@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ -static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); } @@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); } @@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; @@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline unsigned long ffz(unsigned long word) +static __always_inline unsigned long ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) @@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ -static inline int ffs(int x) +static __always_inline int ffs(int x) { int r; @@ -434,7 +434,7 @@ static inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. 
*/ -static inline int fls(int x) +static __always_inline int fls(int x) { int r; -- cgit From 5f9c01aa7c49a2d74474d6d879a797b8badf29e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:29 +0100 Subject: x86/microcode: Untangle from BLK_DEV_INITRD Thomas Voegtle reported that doing oldconfig with a .config which has CONFIG_MICROCODE enabled but BLK_DEV_INITRD disabled prevents the microcode loading mechanism from being built. So untangle it from the BLK_DEV_INITRD dependency so that oldconfig doesn't turn it off and add an explanatory text to its Kconfig help what the supported methods for supplying microcode are. Reported-by: Thomas Voegtle Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: # 4.4 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 27 ++++++++++++++------------- arch/x86/include/asm/microcode.h | 26 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/microcode/intel.c | 14 ++++---------- 3 files changed, 44 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9af2e6338400..405b1858134b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1159,22 +1159,23 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - depends on BLK_DEV_INITRD select FW_LOADER ---help--- - If you say Y here, you will be able to update the microcode on - certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, - Xeon etc. The AMD support is for families 0x10 and later. You will - obviously need the actual microcode binary data itself which is not - shipped with the Linux kernel. - - This option selects the general module only, you need to select - at least one vendor specific module as well. - - To compile this driver as a module, choose M here: the module - will be called microcode. + Intel and AMD processors. The Intel support is for the IA32 family, + e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The + AMD support is for families 0x10 and later. You will obviously need + the actual microcode binary data itself which is not shipped with + the Linux kernel. + + The preferred method to load microcode from a detached initrd is described + in Documentation/x86/early-microcode.txt. For that you need to enable + CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the + initrd for microcode blobs. + + In addition, you can build-in the microcode into the kernel. For that you + need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode + to the CONFIG_EXTRA_FIRMWARE config option. 
config MICROCODE_INTEL bool "Intel microcode loading support" diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 1e1b07a5a738..9d3a96c4da78 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -3,6 +3,7 @@ #include #include +#include #define native_rdmsr(msr, val1, val2) \ do { \ @@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif + +static inline unsigned long get_initrd_start(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + return initrd_start; +#else + return 0; +#endif +} + +static inline unsigned long get_initrd_start_addr(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + + return (unsigned long)__pa_nodebug(*initrd_start_p); +#else + return get_initrd_start(); +#endif +#else /* CONFIG_BLK_DEV_INITRD */ + return 0; +#endif +} + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee81c544ee0d..044bbbbcbaf1 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -690,7 +690,7 @@ int __init save_microcode_in_initrd_intel(void) if (count == 0) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -748,20 +748,14 @@ void load_ucode_intel_ap(void) struct mc_saved_data *mc_saved_data_p; struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); #else - mc_saved_data_p = &mc_saved_data; mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mc_saved_data_p = &mc_saved_data; #endif /* @@ -773,7 +767,7 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); + get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; -- cgit From 264285ac01673e70557c43ecee338ce97c4c0672 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:30 +0100 Subject: x86/microcode/intel: Make early loader look for builtin microcode too Set the initrd @start depending on the presence of an initrd. Otherwise, builtin microcode loading doesn't work as the start is wrong and we're using it to compute offset to the microcode blobs. 
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: # 4.4 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 044bbbbcbaf1..4f4735bd8698 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -551,10 +551,14 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } return get_matching_model_microcode(0, start, cd.data, cd.size, @@ -728,16 +732,20 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); + + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; size = boot_params.hdr.ramdisk_size; + start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif -- cgit From e8c8165ecfb1cfd6650777c193361d33b0f7f59e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:31 +0100 Subject: x86/microcode: Remove redundant __setup() param parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do parse for the disable microcode loader chicken bit very early. After the driver merge, the __setup() param parsing method is not needed anymore so get rid of it. 
In addition, fix a compiler warning from an old SLES11 gcc (4.3.4) reported by Jan Beulich : arch/x86/kernel/cpu/microcode/core.c: In function ‘load_ucode_bsp’: arch/x86/kernel/cpu/microcode/core.c:96: warning: array subscript is above array bounds Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index faec7120c508..bca4e48b531d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,16 +43,8 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; - static bool dis_ucode_ldr; -static int __init disable_loader(char *str) -{ - dis_ucode_ldr = true; - return 1; -} -__setup("dis_ucode_ldr", disable_loader); - /* * Synchronization. * @@ -81,15 +73,16 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + static const char *__dis_opt_str = "dis_ucode_ldr"; + #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); #else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; + const char *option = __dis_opt_str; bool *res = &dis_ucode_ldr; #endif -- cgit From 43858f57bc02388a6420321c021397591199cf91 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 12:33:32 +0100 Subject: x86/microcode: Remove an unneeded NULL check "uci" is an element of the ucode_cpu_info[] array, it can't be NULL. Tested-by: Thomas Voegtle Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/1454499225-21544-5-git-send-email-bp@alien8.de Link: http://lkml.kernel.org/r/20140120103046.GC14233@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index bca4e48b531d..cea8552e2b3a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -472,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci && uci->valid) + if (uci->valid) return UCODE_OK; if (collect_cpu_info(cpu)) -- cgit From b7f500aedd4551a9bf29c617804c13f0ff18c879 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:33 +0100 Subject: x86/microcode: Issue update message only once This is especially annoying on large boxes: x86: Booting SMP configuration: .... node #0, CPUs: #1 microcode: CPU1 microcode updated early to revision 0x428, date = 2014-05-29 #2 microcode: CPU2 microcode updated early to revision 0x428, date = 2014-05-29 #3 ... so issue the update message only once. $ grep microcode /proc/cpuinfo shows whether every core got updated properly. 
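For illustration (a sketch of the idiom, not the kernel's actual printk implementation): the *_once variants behave like a printk guarded by a static flag, which is why the early-update message now appears once per boot rather than once per CPU. A hypothetical userspace stand-in:

#include <stdio.h>
#include <stdbool.h>

/* invented stand-in for pr_info_once(): print only on the first call */
#define my_pr_info_once(fmt, ...)                               \
do {                                                            \
        static bool printed;                                    \
        if (!printed) {                                         \
                printed = true;                                 \
                printf(fmt, ##__VA_ARGS__);                     \
        }                                                       \
} while (0)

int main(void)
{
        for (int cpu = 1; cpu <= 4; cpu++)
                my_pr_info_once("microcode updated early to revision 0x%x\n", 0x428);
        return 0;
}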
Reported-by: Ingo Molnar Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 4 ++-- arch/x86/kernel/cpu/microcode/intel.c | 13 +++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..5b63e2f669b0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -432,8 +432,8 @@ int __init save_microcode_in_initrd_amd(void) container = cont_va; if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 4f4735bd8698..f4bc5fe00d46 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -571,14 +571,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); } #ifdef CONFIG_X86_32 -- cgit From a58017c62b5fa23e532b731e70a63ded54cc2c02 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:34 +0100 Subject: x86/microcode/AMD: Drop redundant printk prefix It is supplied by pr_fmt already. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 5b63e2f669b0..9f5ccef33fba 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -469,8 +469,7 @@ void reload_ucode_amd(void) if (mc && rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); } } } -- cgit From bd6fe58d8e60d45c80ed664a55ef0df5fa076014 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:35 +0100 Subject: x86/microcode/intel: Rename local variables of type struct mc_saved_data So it is always a head-twister when trying to stare at code which has a bunch of struct mc_saved_data *mc_saved_data; local function variables *and* a global mc_saved_data of the same name. Rename all locals to "mcs" to differentiate from the global one. No functionality change. 
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f4bc5fe00d46..4af30bee2161 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -89,42 +89,39 @@ copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, #ifdef CONFIG_X86_32 static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) +microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) { int i; struct microcode_intel ***mc_saved; - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + + for (i = 0; i < mcs->mc_saved_count; i++) { struct microcode_intel *p; - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); + p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; + unsigned int count = mcs->mc_saved_count; - if (!mc_saved_data->mc_saved) { + if (!mcs->mc_saved) { copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); + microcode_phys(mc_saved_tmp, mcs); return load_microcode_early(mc_saved_tmp, count, uci); #else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); + return load_microcode_early(mcs->mc_saved, count, uci); #endif } } @@ -175,7 +172,7 @@ matching_model_microcode(struct microcode_header_intel *mc_header, } static int -save_microcode(struct mc_saved_data *mc_saved_data, +save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, unsigned int mc_saved_count) { @@ -219,8 +216,8 @@ save_microcode(struct mc_saved_data *mc_saved_data, /* * Point to newly saved microcode. 
*/ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->mc_saved_count = mc_saved_count; return 0; @@ -286,7 +283,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, static enum ucode_state __init get_matching_model_microcode(int cpu, unsigned long start, void *data, size_t size, - struct mc_saved_data *mc_saved_data, + struct mc_saved_data *mcs, unsigned long *mc_saved_in_initrd, struct ucode_cpu_info *uci) { @@ -296,7 +293,7 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_size; struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + unsigned int mc_saved_count = mcs->mc_saved_count; int i; while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { @@ -342,7 +339,7 @@ get_matching_model_microcode(int cpu, unsigned long start, for (i = 0; i < mc_saved_count; i++) mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved_count = mc_saved_count; out: return state; } @@ -536,7 +533,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -562,7 +559,7 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, } return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); + mcs, initrd, uci); } /* @@ -702,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -711,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, collect_cpu_info_early(&uci); - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + ret = scan_microcode(mcs, initrd, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mc_saved_data, initrd, start, &uci); + ret = load_microcode(mcs, initrd, start, &uci); if (ret != UCODE_OK) return; @@ -750,28 +746,28 @@ void __init load_ucode_intel_bsp(void) void load_ucode_intel_ap(void) { - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; + struct mc_saved_data *mcs_p; + struct ucode_cpu_info uci; enum ucode_state ret; #ifdef CONFIG_X86_32 mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else mc_saved_in_initrd_p = mc_saved_in_initrd; - mc_saved_data_p = &mc_saved_data; + mcs_p = &mc_saved_data; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. 
*/ - if (mc_saved_data_p->mc_saved_count == 0) + if (mcs_p->mc_saved_count == 0) return; collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + ret = load_microcode(mcs_p, mc_saved_in_initrd_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) -- cgit From 4fe9349fc3b042b481692b577bda97cde4d6f517 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:36 +0100 Subject: x86/microcode/intel: Rename mc_saved_count to num_saved It is shorter and easier on the eyes. Change the "== 0" tests to "!..." while at it. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 61 ++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 4af30bee2161..9f5fe72419ba 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -41,7 +41,7 @@ static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; } mc_saved_data; @@ -96,7 +96,7 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); - for (i = 0; i < mcs->mc_saved_count; i++) { + for (i = 0; i < mcs->num_saved; i++) { struct microcode_intel *p; p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); @@ -110,7 +110,7 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->mc_saved_count; + unsigned int count = mcs->num_saved; if (!mcs->mc_saved) { copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); @@ -174,23 +174,23 @@ matching_model_microcode(struct microcode_header_intel *mc_header, static int save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) + unsigned int num_saved) { int i, j; struct microcode_intel **saved_ptr; int ret; - if (!mc_saved_count) + if (!num_saved) return -EINVAL; /* * Copy new microcode data. */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); if (!saved_ptr) return -ENOMEM; - for (i = 0; i < mc_saved_count; i++) { + for (i = 0; i < num_saved; i++) { struct microcode_header_intel *mc_hdr; struct microcode_intel *mc; unsigned long size; @@ -216,8 +216,8 @@ save_microcode(struct mc_saved_data *mcs, /* * Point to newly saved microcode. 
*/ - mcs->mc_saved = saved_ptr; - mcs->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->num_saved = num_saved; return 0; @@ -293,10 +293,10 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_size; struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mcs->mc_saved_count; + unsigned int num_saved = mcs->num_saved; int i; - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { if (leftover < sizeof(mc_header)) break; @@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start, continue; } - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); ucode_ptr += mc_size; } @@ -331,15 +331,15 @@ get_matching_model_microcode(int cpu, unsigned long start, goto out; } - if (mc_saved_count == 0) { + if (!num_saved) { state = UCODE_NFOUND; goto out; } - for (i = 0; i < mc_saved_count; i++) + for (i = 0; i < num_saved; i++) mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - mcs->mc_saved_count = mc_saved_count; + mcs->num_saved = num_saved; out: return state; } @@ -393,11 +393,11 @@ static void show_saved_mc(void) unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; - if (mc_saved_data.mc_saved_count == 0) { + if (!mc_saved_data.num_saved) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); @@ -406,7 +406,7 @@ static void show_saved_mc(void) rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + for (i = 0; i < mc_saved_data.num_saved; i++) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; int ext_sigcount; @@ -462,7 +462,7 @@ int save_mc_for_early(u8 *mc) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; int ret = 0; int i; @@ -473,23 +473,23 @@ int save_mc_for_early(u8 *mc) */ mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved_count_init = mc_saved_data.num_saved; + num_saved = mc_saved_data.num_saved; mc_saved = mc_saved_data.mc_saved; - if (mc_saved && mc_saved_count) + if (mc_saved && num_saved) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); + num_saved * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, mc, num_saved); /* * Save the mc_save_tmp in global mc_saved_data. 
*/ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); if (ret) { pr_err("Cannot save microcode patch.\n"); goto out; @@ -681,14 +681,15 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.mc_saved_count; + unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; int ret = 0; - if (count == 0) + if (!count) return ret; copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); + ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -763,7 +764,7 @@ void load_ucode_intel_ap(void) * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (mcs_p->mc_saved_count == 0) + if (!mcs_p->num_saved) return; collect_cpu_info_early(&uci); @@ -781,13 +782,13 @@ void reload_ucode_intel(void) struct ucode_cpu_info uci; enum ucode_state ret; - if (!mc_saved_data.mc_saved_count) + if (!mc_saved_data.num_saved) return; collect_cpu_info_early(&uci); ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; -- cgit From de778275c295825e6638f3f74103f40642d45caa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:37 +0100 Subject: x86/microcode/intel: Rename mc_intel variable to mc Well, it is apparent what it points to - microcode. And since it is the intel loader, no need for the "_intel" suffix. Use "!" for the 0/NULL checks, while at it. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 9f5fe72419ba..d1b2f583f543 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -601,19 +601,19 @@ void show_ucode_info_early(void) */ static void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; + *current_mc_date_p = mc->hdr.date; } #else @@ -628,29 +628,29 @@ static inline void flush_tlb_early(void) static inline void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; unsigned int val[2]; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return 0; /* write microcode via MSR 0x79 */ 
native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); + (unsigned long)mc->bits, + (unsigned long)mc->bits >> 16 >> 16); native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* As documented in the SDM: Do a CPUID 1 here */ @@ -658,7 +658,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) + if (val[1] != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 @@ -670,7 +670,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (early) print_ucode(uci); else - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); return 0; } @@ -821,7 +821,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) * return 0 - no update found * return 1 - found update */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +static int get_matching_mc(struct microcode_intel *mc, int cpu) { struct cpu_signature cpu_sig; unsigned int csig, cpf, crev; @@ -832,38 +832,38 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return has_newer_microcode(mc_intel, csig, cpf, crev); + return has_newer_microcode(mc, csig, cpf, crev); } static int apply_microcode_intel(int cpu) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; struct ucode_cpu_info *uci; unsigned int val[2]; int cpu_num = raw_smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu_num); uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; + mc = uci->mc; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (mc_intel == NULL) + if (!mc) return 0; /* * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this + * microcode patch in mc when it is newer than the one on this * CPU. */ - if (get_matching_mc(mc_intel, cpu) == 0) + if (!get_matching_mc(mc, cpu)) return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); + (unsigned long) mc->bits, + (unsigned long) mc->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* As documented in the SDM: Do a CPUID 1 here */ @@ -872,16 +872,16 @@ static int apply_microcode_intel(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) { + if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + cpu_num, mc->hdr.rev); return -1; } pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; c->microcode = val[1]; -- cgit From 58b5f2cc4bdbc9b616e68639f5a84886aa5be590 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:38 +0100 Subject: x86/microcode/intel: Move the BUG_ON up and turn it into WARN_ON If we're going to BUG_ON() because we're running on the wrong CPU, we better do it as the first thing we do when entering that function. And also, turn it into a WARN_ON() because it is not worth to panic the system if we apply the microcode on the wrong CPU - we're simply going to exit early. 
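For illustration (userspace sketch with an invented stand-in macro): WARN_ON(x) evaluates to the truth value of x, unlike BUG_ON(), so it can guard an early return the way the hunk below uses it in apply_microcode_intel():

#include <stdio.h>

/* invented stand-in for WARN_ON(): complain, then hand back the condition */
#define my_warn_on(cond) ({                                             \
        int __c = !!(cond);                                             \
        if (__c)                                                        \
                fprintf(stderr, "WARNING at %s:%d\n", __FILE__, __LINE__); \
        __c;                                                            \
})

static int apply_on_cpu(int running_cpu, int target_cpu)
{
        /* bail out instead of panicking when called on the wrong CPU */
        if (my_warn_on(running_cpu != target_cpu))
                return -1;
        return 0;
}

int main(void)
{
        printf("right cpu -> %d\n", apply_on_cpu(2, 2));
        printf("wrong cpu -> %d\n", apply_on_cpu(1, 2));
        return 0;
}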
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d1b2f583f543..c029c2bb2a29 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -843,12 +843,12 @@ static int apply_microcode_intel(int cpu) int cpu_num = raw_smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu_num); - uci = ucode_cpu_info + cpu; - mc = uci->mc; - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + if (WARN_ON(cpu_num != cpu)) + return -1; + uci = ucode_cpu_info + cpu; + mc = uci->mc; if (!mc) return 0; -- cgit From 26cbaa4dc676a444aa626cbc642c4c8181ef1378 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:39 +0100 Subject: x86/microcode/intel: Cleanup apply_microcode_intel() Get rid of local variable cpu_num as it is equal to @cpu now. Deref cpu_data() only when it is really needed at the end. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c029c2bb2a29..35186a0dd5fc 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -839,12 +839,11 @@ static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; struct ucode_cpu_info *uci; + struct cpuinfo_x86 *c; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); /* We should bind the task to the CPU */ - if (WARN_ON(cpu_num != cpu)) + if (WARN_ON(raw_smp_processor_id() != cpu)) return -1; uci = ucode_cpu_info + cpu; @@ -874,15 +873,18 @@ static int apply_microcode_intel(int cpu) if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc->hdr.rev); + cpu, mc->hdr.rev); return -1; } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], + cpu, val[1], mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); + c = &cpu_data(cpu); + uci->cpu_sig.rev = val[1]; c->microcode = val[1]; -- cgit From c416e6117575213a5a962149620684a09f9e4ece Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:40 +0100 Subject: x86/microcode/intel: Use *wrmsrl variants ... and drop the 32-bit casting games which we had to do at the time because wrmsr() was unforgiving then, see c3fd0bd5e19a from the full history tree: commit c3fd0bd5e19aaff9cdd104edff136a2023db657e Author: Linus Torvalds Date: Tue Feb 17 23:23:41 2004 -0800 Fix up the microcode update on regular 32-bit x86. Our wrmsr() is a bit unforgiving and really doesn't like 64-bit values. ... 
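For illustration (userspace sketch, helper names invented): the old code split the 64-bit value into lo/hi halves by hand and used ">> 16 >> 16" rather than ">> 32" because shifting a 32-bit 'unsigned long' by 32 is undefined in C, while two 16-bit shifts safely yield 0. Handing the whole value to a *wrmsrl()-style helper removes the need for either trick:

#include <stdio.h>
#include <stdint.h>

/* invented stand-in for wrmsr(msr, lo, hi): just show what would be written */
static void wrmsr_lo_hi(uint32_t msr, uint32_t lo, uint32_t hi)
{
        printf("msr %#x <- hi=%#x lo=%#x\n", (unsigned)msr, (unsigned)hi, (unsigned)lo);
}

/* invented stand-in for wrmsrl(msr, val): one 64-bit value, split internally */
static void wrmsr_64(uint32_t msr, uint64_t val)
{
        wrmsr_lo_hi(msr, (uint32_t)val, (uint32_t)(val >> 32));
}

int main(void)
{
        unsigned long bits = 0x12345678UL;  /* 32 bits wide on i386, 64 on x86_64 */

        /* old style: ">> 16 >> 16" stands in for ">> 32" and stays well-defined
         * even when 'unsigned long' is only 32 bits wide */
        wrmsr_lo_hi(0x79, bits, bits >> 16 >> 16);

        /* new style: the helper takes the full 64-bit value */
        wrmsr_64(0x79, bits);
        return 0;
}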
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 35186a0dd5fc..ff0b44951d12 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -370,7 +370,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -648,10 +648,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long)mc->bits, - (unsigned long)mc->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -860,10 +858,8 @@ static int apply_microcode_intel(int cpu) return 0; /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc->bits, - (unsigned long) mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); -- cgit From f8bb45e2c4acf1395bb6e61a135ce8c9107388cf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:41 +0100 Subject: x86/microcode/intel: Rename mc_saved_in_initrd Rename it to mc_tmp_ptrs to denote better what it is - a temporary array for saving pointers to microcode blobs. And "initrd" is not accurate anymore since initrd is not the only source for early microcode. Therefore, rename copy_initrd_ptrs() to copy_ptrs() simply and "initrd_start" to "offset". And then do the following convention: the global variable is called "mc_tmp_ptrs" and the local function arguments "mc_ptrs" for differentiation. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 52 +++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ff0b44951d12..5970758bbcdd 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,7 +39,13 @@ #include #include -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +/* + * Temporary microcode blobs pointers storage. We note here the pointers to + * microcode blobs we've got from whatever storage (detached initrd, builtin). + * Later on, we put those into final storage mc_saved_data.mc_saved. 
+ */ +static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; + static struct mc_saved_data { unsigned int num_saved; struct microcode_intel **mc_saved; @@ -78,13 +84,13 @@ load_microcode_early(struct microcode_intel **saved, } static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) +copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, + unsigned long off, int num_saved) { int i; for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); + mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); } #ifdef CONFIG_X86_32 @@ -106,14 +112,14 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) #endif static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + unsigned long offset, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int count = mcs->num_saved; if (!mcs->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { @@ -284,7 +290,7 @@ static enum ucode_state __init get_matching_model_microcode(int cpu, unsigned long start, void *data, size_t size, struct mc_saved_data *mcs, - unsigned long *mc_saved_in_initrd, + unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { u8 *ucode_ptr = data; @@ -337,7 +343,7 @@ get_matching_model_microcode(int cpu, unsigned long start, } for (i = 0; i < num_saved; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; mcs->num_saved = num_saved; out: @@ -533,7 +539,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -559,7 +565,7 @@ scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, } return get_matching_model_microcode(0, start, cd.data, cd.size, - mcs, initrd, uci); + mcs, mc_ptrs, uci); } /* @@ -675,7 +681,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. 
*/ int __init save_microcode_in_initrd_intel(void) { @@ -686,7 +692,7 @@ int __init save_microcode_in_initrd_intel(void) if (!count) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); + copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) @@ -698,7 +704,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -706,11 +712,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, initrd, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mcs, initrd, start, &uci); + ret = load_microcode(mcs, mc_ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -733,28 +739,28 @@ void __init load_ucode_intel_bsp(void) start = (size ? p->hdr.ramdisk_image : 0); _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), start, size); #else size = boot_params.hdr.ramdisk_size; start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); + _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); #endif } void load_ucode_intel_ap(void) { - unsigned long *mc_saved_in_initrd_p; + unsigned long *mcs_tmp_p; struct mc_saved_data *mcs_p; struct ucode_cpu_info uci; enum ucode_state ret; #ifdef CONFIG_X86_32 - mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else - mc_saved_in_initrd_p = mc_saved_in_initrd; + mcs_tmp_p = mc_tmp_ptrs; mcs_p = &mc_saved_data; #endif @@ -766,9 +772,7 @@ void load_ucode_intel_ap(void) return; collect_cpu_info_early(&uci); - ret = load_microcode(mcs_p, mc_saved_in_initrd_p, - get_initrd_start_addr(), &uci); - + ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; -- cgit From 2f303c524ed021825671cfa9b1934338bc04f8ab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:42 +0100 Subject: x86/microcode/intel: Remove unused arg of get_matching_model_microcode() @cpu is unused, kill it. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5970758bbcdd..0c67fd060680 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -287,7 +287,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. 
*/ static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, +get_matching_model_microcode(unsigned long start, void *data, size_t size, struct mc_saved_data *mcs, unsigned long *mc_ptrs, @@ -564,7 +564,7 @@ scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, return UCODE_ERROR; } - return get_matching_model_microcode(0, start, cd.data, cd.size, + return get_matching_model_microcode(start, cd.data, cd.size, mcs, mc_ptrs, uci); } -- cgit From f96fde531946524b26d25d4eed9625695837f524 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:43 +0100 Subject: x86/microcode/intel: Cleanup get_matching_model_microcode() Reflow arguments, sort local variables in reverse christmas tree, kill "out" label. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 0c67fd060680..cb397947f688 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -287,19 +287,17 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. */ static enum ucode_state __init -get_matching_model_microcode(unsigned long start, - void *data, size_t size, - struct mc_saved_data *mcs, - unsigned long *mc_ptrs, +get_matching_model_microcode(unsigned long start, void *data, size_t size, + struct mc_saved_data *mcs, unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + struct microcode_header_intel *mc_header; unsigned int num_saved = mcs->num_saved; + enum ucode_state state = UCODE_OK; + unsigned int leftover = size; + u8 *ucode_ptr = data; + unsigned int mc_size; int i; while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { @@ -321,8 +319,7 @@ get_matching_model_microcode(unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { ucode_ptr += mc_size; continue; } @@ -334,19 +331,19 @@ get_matching_model_microcode(unsigned long start, if (leftover) { state = UCODE_ERROR; - goto out; + return state; } if (!num_saved) { state = UCODE_NFOUND; - goto out; + return state; } for (i = 0; i < num_saved; i++) mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; mcs->num_saved = num_saved; -out: + return state; } -- cgit From f7eb59dda129e46be5e195a46bfd0dde76db9bbd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:44 +0100 Subject: x86/microcode/AMD: Issue microcode updated message later Before this, we issued this message from save_microcode_in_initrd() which is called from free_initrd_mem(), i.e., only when we have an initrd enabled. However, we can update from builtin microcode too but then we don't issue the update message. Fix it by issuing that message on the generic driver init path. 
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9f5ccef33fba..f66cbfe74ce4 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; - if (ucode_new_rev) - pr_info_once("microcode updated early to new patch_level=0x%08x\n", - ucode_new_rev); - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); @@ -956,6 +952,10 @@ struct microcode_ops * __init init_amd_microcode(void) return NULL; } + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + return µcode_amd_ops; } -- cgit From 06bea3dbfe6a4c333c4333362c46bdf4d9e43504 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 4 Feb 2016 11:29:36 -0800 Subject: locking/lockdep: Eliminate lockdep_init() Lockdep is initialized at compile time now. Get rid of lockdep_init(). Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Mike Krinkin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/lguest/boot.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229ac3f4f..f56cc418c87d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1520,12 +1520,6 @@ __init void lguest_init(void) */ reserve_top_address(lguest_data.reserve_mem); - /* - * If we don't initialize the lock dependency checker now, it crashes - * atomic_notifier_chain_register, then paravirt_disable_iospace. - */ - lockdep_init(); - /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); -- cgit From b51012deb390528d89d426f328d84618683f5d73 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 Jan 2016 11:39:22 +0100 Subject: KVM: x86: introduce do_shl32_div32 This is similar to the existing div_frac function, but it returns the remainder too. Unlike div_frac, it can be used to implement long division, e.g. (a << 64) / b for 32-bit a and b. 
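For illustration (a portable userspace equivalent used only to check the arithmetic; the real macro relies on the divl instruction): do_shl32_div32(n, base) divides (n << 32) by base, leaves the quotient in n and hands back the remainder. A plain 64-bit division reproduces it whenever n < base, i.e. whenever the quotient fits in 32 bits:

#include <stdio.h>
#include <stdint.h>

/* divide (n << 32) by base, put the quotient back in *n, return the remainder */
static uint32_t shl32_div32(uint32_t *n, uint32_t base)
{
        uint64_t dividend = (uint64_t)*n << 32;
        uint32_t rem = (uint32_t)(dividend % base);

        *n = (uint32_t)(dividend / base);   /* fits in 32 bits only if *n < base */
        return rem;
}

int main(void)
{
        uint32_t n = 3;
        uint32_t rem = shl32_div32(&n, 7);  /* (3 << 32) / 7 */

        printf("quotient=%#x remainder=%u\n", (unsigned)n, (unsigned)rem);
        return 0;
}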
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++-------- arch/x86/kvm/x86.h | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4244c2baf57d..5b937fdebc66 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1196,14 +1196,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { - uint32_t quotient, remainder; - - /* Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" */ - __asm__ ( "divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor) ); - return quotient; + do_shl32_div32(dividend, divisor); + return dividend; } static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f2afa5fe48a6..34f416427143 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -192,4 +192,19 @@ extern unsigned int min_timer_period_us; extern unsigned int lapic_timer_advance_ns; extern struct static_key kvm_no_apic_vcpu; + +/* Same "calling convention" as do_div: + * - divide (n << 32) by base + * - put result in n + * - return remainder + */ +#define do_shl32_div32(n, base) \ + ({ \ + u32 __quot, __rem; \ + asm("divl %2" : "=a" (__quot), "=d" (__rem) \ + : "rm" (base), "0" (0), "1" ((u32) n)); \ + n = __quot; \ + __rem; \ + }) + #endif -- cgit From 23a1c2579b575b228a6c685dfe93f296d3d5e0e1 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Mon, 25 Jan 2016 16:53:32 +0800 Subject: KVM: Recover IRTE to remapped mode if the interrupt is not single-destination MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the interrupt is not single destination any more, we need to change back IRTE to remapped mode explicitly. Signed-off-by: Feng Wu Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2951b6edbbc..a4b4aa4cdc54 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10764,8 +10764,21 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, */ kvm_set_msi_irq(e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + continue; + } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; -- cgit From 520040146a0af36f7875ec06b58f44b19a0edf53 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Mon, 25 Jan 2016 16:53:33 +0800 Subject: KVM: x86: Use vector-hashing to deliver lowest-priority interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use vector-hashing to deliver lowest-priority interrupts, As an example, modern Intel CPUs in server platform use this method to handle lowest-priority interrupts. 
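For illustration (userspace sketch, helper name invented): the vector-hashing pick boils down to "guest vector mod number of candidate destinations", then walking the destination bitmap to that set bit; kvm_vector_to_index() in the hunks below does the same over the real vCPU bitmap:

#include <stdio.h>

/* pick the (vector % popcount)-th set bit of the destination bitmap */
static int pick_dest(unsigned int vector, unsigned long bitmap, int nbits)
{
        int set = __builtin_popcountl(bitmap);
        int seen = -1, want;

        if (!set)
                return -1;

        want = vector % set;
        for (int i = 0; i < nbits; i++) {
                if ((bitmap & (1UL << i)) && ++seen == want)
                        return i;
        }
        return -1;
}

int main(void)
{
        unsigned long dest = 0x0b;      /* candidate vCPUs 0, 1 and 3 */

        for (unsigned int vec = 32; vec < 36; vec++)
                printf("vector %u -> vCPU %d\n", vec, pick_dest(vec, dest, 16));
        return 0;
}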
Signed-off-by: Feng Wu Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/irq_comm.c | 25 +++++++++++++++++---- arch/x86/kvm/lapic.c | 50 ++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/lapic.h | 2 ++ arch/x86/kvm/x86.c | 9 ++++++++ arch/x86/kvm/x86.h | 1 + 6 files changed, 82 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44adbb819041..7b5459982433 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -754,6 +754,8 @@ struct kvm_arch { bool irqchip_split; u8 nr_reserved_ioapic_pins; + + bool disabled_lapic_found; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8fc89efb5250..37217363887d 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -34,6 +34,7 @@ #include "lapic.h" #include "hyperv.h" +#include "x86.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -57,6 +58,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { @@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; @@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_lapic_enabled(vcpu)) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; + if (!kvm_vector_hashing_enabled()) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } } } + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + if (lowest) r = kvm_apic_set_irq(lowest, irq, dest_map); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36591faed13b..1a4ca1d05fe9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -675,6 +675,22 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } } +int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size) +{ + u32 mod; + int i, idx = -1; + + mod = vector % dest_vcpus; + + for (i = 0; i <= mod; i++) { + idx = find_next_bit(bitmap, bitmap_size, idx + 1); + BUG_ON(idx == bitmap_size); + } + + return idx; +} + bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) { @@ -727,21 +743,49 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (kvm_lowest_prio_delivery(irq)) { + if (!kvm_lowest_prio_delivery(irq)) + goto set_irq; + + if (!kvm_vector_hashing_enabled()) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; if (l < 0) l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) + else if 
(kvm_apic_compare_prio(dst[i]->vcpu, + dst[l]->vcpu) < 0) l = i; } - bitmap = (l >= 0) ? 1 << l : 0; + } else { + int idx; + unsigned int dest_vcpus; + + dest_vcpus = hweight16(bitmap); + if (dest_vcpus == 0) + goto out; + + idx = kvm_vector_to_index(irq->vector, + dest_vcpus, &bitmap, 16); + + /* + * We may find a hardware disabled LAPIC here, if that + * is the case, print out a error message once for each + * guest and return. + */ + if (!dst[idx] && !kvm->arch.disabled_lapic_found) { + kvm->arch.disabled_lapic_found = true; + printk(KERN_INFO + "Disabled LAPIC found during irq injection\n"); + goto out; + } + + bitmap = (idx >= 0) ? 1 << idx : 0; } } +set_irq: for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 41bdb35b4b67..afccf4099b00 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -175,4 +175,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5b937fdebc66..aafbcf9f9776 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly vector_hashing = true; +module_param(vector_hashing, bool, S_IRUGO); + static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -8364,6 +8367,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } +bool kvm_vector_hashing_enabled(void) +{ + return vector_hashing; +} +EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 34f416427143..007940faa5c6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -179,6 +179,7 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); +bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ -- cgit From 6228a0da805792c2f25b32e9b926d0810a6648ab Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Mon, 25 Jan 2016 16:53:34 +0800 Subject: KVM: x86: Add lowest-priority support for vt-d posted-interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use vector-hashing to deliver lowest-priority interrupts for VT-d posted-interrupts. This patch extends kvm_intr_is_single_vcpu() to support lowest-priority handling. 
Signed-off-by: Feng Wu Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 56 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1a4ca1d05fe9..1520d1acd0ad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -798,6 +798,20 @@ out: return ret; } +/* + * This routine tries to handler interrupts in posted mode, here is how + * it deals with different cases: + * - For single-destination interrupts, handle it in posted mode + * - Else if vector hashing is enabled and it is a lowest-priority + * interrupt, handle it in posted mode and use the following mechanism + * to find the destinaiton vCPU. + * 1. For lowest-priority interrupts, store all the possible + * destination vCPUs in an array. + * 2. Use "guest vector % max number of destination vCPUs" to find + * the right destination vCPU in the array for the lowest-priority + * interrupt. + * - Otherwise, use remapped mode to inject the interrupt. + */ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { @@ -839,16 +853,44 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, if (cid >= ARRAY_SIZE(map->logical_map)) goto out; - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) + if (kvm_vector_hashing_enabled() && + kvm_lowest_prio_delivery(irq)) { + int idx; + unsigned int dest_vcpus; + + dest_vcpus = hweight16(bitmap); + if (dest_vcpus == 0) goto out; - } - if (dst && kvm_apic_present(dst->vcpu)) + idx = kvm_vector_to_index(irq->vector, dest_vcpus, + &bitmap, 16); + + /* + * We may find a hardware disabled LAPIC here, if that + * is the case, print out a error message once for each + * guest and return + */ + dst = map->logical_map[cid][idx]; + if (!dst && !kvm->arch.disabled_lapic_found) { + kvm->arch.disabled_lapic_found = true; + printk(KERN_INFO + "Disabled LAPIC found during irq injection\n"); + goto out; + } + *dest_vcpu = dst->vcpu; - else - goto out; + } else { + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } } ret = true; -- cgit From b6ce978067e75187d3c30f59b60d390a29374fab Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Mon, 25 Jan 2016 16:53:35 +0800 Subject: KVM/VMX: Add host irq information in trace event when updating IRTE for posted interrupts Add host irq information in trace event, so we can better understand which irq is in posted mode. Signed-off-by: Feng Wu Reviewed-by: Radim Krcmar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 12 ++++++++---- arch/x86/kvm/vmx.c | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ad9f6a23f139..2f1ea2f61e1f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm, * Tracepoint for VT-d posted-interrupts. 
*/ TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int vcpu_id, unsigned int gsi, - unsigned int gvec, u64 pi_desc_addr, bool set), - TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, + unsigned int gsi, unsigned int gvec, + u64 pi_desc_addr, bool set), + TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), TP_STRUCT__entry( + __field( unsigned int, host_irq ) __field( unsigned int, vcpu_id ) __field( unsigned int, gsi ) __field( unsigned int, gvec ) @@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update, ), TP_fast_assign( + __entry->host_irq = host_irq; __entry->vcpu_id = vcpu_id; __entry->gsi = gsi; __entry->gvec = gvec; @@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->set = set; ), - TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " "gvec: 0x%x, pi_desc_addr: 0x%llx", __entry->set ? "enabled and being updated" : "disabled", + __entry->host_irq, __entry->vcpu_id, __entry->gsi, __entry->gvec, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4b4aa4cdc54..164eb9e1678b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10783,7 +10783,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; - trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); if (set) -- cgit From 69e0210fd01ff157d332102219aaf5c26ca8069b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:18 +0300 Subject: x86/kasan: Clear kasan_zero_page after TLB flush Currently we clear kasan_zero_page before __flush_tlb_all(). This works with current implementation of native_flush_tlb[_global]() because it doesn't cause do any writes to kasan shadow memory. But any subtle change made in native_flush_tlb*() could break this. Also current code seems doesn't work for paravirt guests (lguest). Only after the TLB flush we can be sure that kasan_zero_page is not used as early shadow anymore (instrumented code will not write to it). So it should cleared it only after the TLB flush. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-2-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d470cf219a2d..303e47045864 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -120,11 +120,16 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear it, since after the TLB flush + * no one should write to it. 
+ */ + memset(kasan_zero_page, 0, PAGE_SIZE); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } -- cgit From 063fb3e56f6dd29b2633b678b837e1d904200e6f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:19 +0300 Subject: x86/kasan: Write protect kasan zero shadow After kasan_init() executed, no one is allowed to write to kasan_zero_page, so write protect it. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 303e47045864..1b1110fa0057 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -125,10 +125,16 @@ void __init kasan_init(void) /* * kasan_zero_page has been used as early shadow memory, thus it may - * contain some garbage. Now we can clear it, since after the TLB flush - * no one should write to it. + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. */ memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); -- cgit From 060a402a1ddb551455ee410de2eadd3349f2801b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:57 -0800 Subject: x86/mm: Add INVPCID helpers This adds helpers for each of the four currently-specified INVPCID modes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029405a3..8b576832777e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,54 @@ #include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + u64 desc[2] = { pcid, addr }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. 
+ */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else -- cgit From d12a72b844a49d4162f24cefdab30bed3f86730e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:58 -0800 Subject: x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID This adds a chicken bit to turn off INVPCID in case something goes wrong. It's an early_param() because we do TLB flushes before we parse __setup() parameters. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f586317ed1bc2b87aee652267e515b90051af385.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de8f60a..f4d0aa64d934 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; -- cgit From d8bced79af1db6734f66b42064cc773cada2ce99 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:59 -0800 Subject: x86/mm: If INVPCID is available, use it to flush global mappings On my Skylake laptop, INVPCID function 2 (flush absolutely everything) takes about 376ns, whereas saving flags, twiddling CR4.PGE to flush global mappings, and restoring flags takes about 539ns. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8b576832777e..fc9a2fda1404 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -152,6 +152,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can -- cgit From ce1143aa60273220a9f89012f2aaaed04f97e9a2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 25 Jan 2016 23:06:49 -0800 Subject: x86/dmi: Switch dmi_remap() from ioremap() [uncached] to ioremap_cache() DMI cacheability is very confused on x86. dmi_early_remap() uses early_ioremap(), which uses FIXMAP_PAGE_IO, which is __PAGE_KERNEL_IO, which is __PAGE_KERNEL, which is cached. Don't ask me why this makes any sense. dmi_remap() uses ioremap(), which requests an uncached mapping. However, on non-EFI systems, the DMI data generally lives between 0xf0000 and 0x100000, which is in the legacy ISA range, which triggers a special case in the PAT code that overrides the cache mode requested by ioremap() and forces a WB mapping. On a UEFI boot, however, the DMI table can live at any physical address. On my laptop, it's around 0x77dd0000. That's nowhere near the legacy ISA range, so the ioremap() implicit uncached type is honored and we end up with a UC- mapping. UC- is a very, very slow way to read from main memory, so dmi_walk() is likely to take much longer than necessary. Given that, even on UEFI, we do early cached DMI reads, it seems safe to just ask for cached access. Switch to ioremap_cache(). I haven't tried to benchmark this, but I'd guess it saves several milliseconds of boot time. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean Delvare Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/3147c38e51f439f3c8911db34c7d4ab22d854915.1453791969.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 535192f6bfad..3c69fed215c5 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len) /* Use early IO mappings for DMI because it's initialized early */ #define dmi_early_remap early_ioremap #define dmi_early_unmap early_iounmap -#define dmi_remap ioremap +#define dmi_remap ioremap_cache #define dmi_unmap iounmap #endif /* _ASM_X86_DMI_H */ -- cgit From a91bbe017552b80e12d712c85549b933a62c6ed4 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 9 Feb 2016 19:44:54 +0600 Subject: x86/boot: Use proper array element type in memset() size calculation I changed open coded zeroing loops to explicit memset()s in the following commit: 5e9ebbd87a99 ("x86/boot: Micro-optimize reset_early_page_tables()") The base for the size argument of memset was sizeof(pud_p/pmd_p), which are pointers - but the initialized array has pud_t/pmd_t elements. Luckily the two types had the same size, so this did not result in any runtime misbehavior. Signed-off-by: Alexander Kuleshov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455025494-4063-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 35843caa4ea7..7793a1702204 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -75,7 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - memset(pud_p, 0, sizeof(pud_p) * PTRS_PER_PUD); + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -90,7 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - memset(pmd_p, 0, sizeof(pmd_p) * PTRS_PER_PMD); + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; -- cgit From dd7b6847670a84b7bb7c38f8e69b2f12059bca66 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 25 Jan 2016 12:25:15 -0500 Subject: x86/mm: Honour passed pgprot in track_pfn_insert() and track_pfn_remap() track_pfn_insert() overwrites the pgprot that is passed in with a value based on the VMA's page_prot. This is a problem for people trying to do clever things with the new vm_insert_pfn_prot() as it will simply overwrite the passed protection flags. If we use the current value of the pgprot as the base, then it will behave as people are expecting. Also fix track_pfn_remap() in the same way. 
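A sketch of the caller pattern this change is meant to serve, using the vm_insert_pfn_prot() helper referenced above (the device name and pfn choice are hypothetical, illustration only):

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Request write-combining instead of the VMA's default protection. */
	pgprot_t prot = pgprot_writecombine(vma->vm_page_prot);

	/*
	 * With track_pfn_insert()/track_pfn_remap() now basing their result
	 * on the passed-in pgprot rather than vma->vm_page_prot, the
	 * write-combining request above is preserved (subject to the PAT
	 * memtype lookup) instead of being silently overwritten.
	 */
	return vm_insert_pfn_prot(vma, vma->vm_start, vma->vm_pgoff, prot);
}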
Signed-off-by: Matthew Wilcox Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1453742717-10326-2-git-send-email-matthew.r.wilcox@intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536b0914..04e2e7144bee 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; @@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; -- cgit From 4ecd16ec7059390b430af34bd8bc3ca2b5dcef9a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:06 -0800 Subject: x86/fpu: Fix math emulation in eager fpu mode Systems without an FPU are generally old and therefore use lazy FPU switching. Unsurprisingly, math emulation in eager FPU mode is a bit buggy. Fix it. There were two bugs involving kernel code trying to use the FPU registers in eager mode even if they didn't exist and one BUG_ON() that was incorrect. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/b4b8d112436bd6fab866e1b4011131507e8d7fbe.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 3 ++- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/traps.c | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440df63f1..a1f78a9fbf41 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -589,7 +589,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = new_fpu->fpstate_active && + fpu.preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active && (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->fpregs_active) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c3fc1d..08e1e11a05ca 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -423,7 +423,7 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu()) { + if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { /* FPU state will be reallocated lazily at the first use. 
*/ fpu__drop(fpu); } else { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..87f80febf477 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -750,7 +750,6 @@ dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { -- cgit From 5ed73f40735c68d8a656b46d09b1885d3b8740ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:07 -0800 Subject: x86/fpu: Fix FNSAVE usage in eagerfpu mode In eager fpu mode, having deactivated FPU without immediately reloading some other context is illegal. Therefore, to recover from FNSAVE, we can't just deactivate the state -- we need to reload it if we're not actively context switching. We had this wrong in fpu__save() and fpu__copy(). Fix both. __kernel_fpu_begin() was fine -- add a comment. This fixes a warning triggerable with nofxsr eagerfpu=on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/60662444e13c76f06e23c15c5dcdba31b4ac3d67.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 08e1e11a05ca..7a9244df33e2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -114,6 +114,10 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (!copy_fpregs_to_fpstate(fpu)) - fpregs_deactivate(fpu); + if (!copy_fpregs_to_fpstate(fpu)) { + if (use_eager_fpu()) + copy_kernel_to_fpregs(&fpu->state); + else + fpregs_deactivate(fpu); + } } preempt_enable(); } @@ -259,7 +267,11 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); - fpregs_deactivate(src_fpu); + + if (use_eager_fpu()) + copy_kernel_to_fpregs(&src_fpu->state); + else + fpregs_deactivate(src_fpu); } preempt_enable(); } -- cgit From a20d7297045f7fdcd676c15243192eb0e95a4306 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:08 -0800 Subject: x86/fpu: Fold fpu_copy() into fpu__copy() Splitting it into two functions needlessly obfuscated the code. While we're at it, improve the comment slightly. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/3eb5a63a9c5c84077b2677a7dfe684eef96fe59e.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7a9244df33e2..299b58bb975b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -231,14 +231,15 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * Copy the current task's FPU state to a new task's FPU context. - * - * In both the 'eager' and the 'lazy' case we save hardware registers - * directly to the destination buffer. - */ -static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (!src_fpu->fpstate_active || !cpu_has_fpu) + return 0; + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); /* @@ -251,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * - * If the FPU context got destroyed in the process (FNSAVE - * done on old CPUs) then copy it back into the source - * context and mark the current task for lazy restore. + * In lazy mode, if the FPU context isn't loaded into + * fpregs, CR0.TS will be set and do_device_not_available + * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -274,16 +274,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) fpregs_deactivate(src_fpu); } preempt_enable(); -} - -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -{ - dst_fpu->counter = 0; - dst_fpu->fpregs_active = 0; - dst_fpu->last_cpu = -1; - - if (src_fpu->fpstate_active && cpu_has_fpu) - fpu_copy(dst_fpu, src_fpu); return 0; } -- cgit From c6ab109f7e0eae3bae3bb10f8ddb0df67735c150 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:09 -0800 Subject: x86/fpu: Speed up lazy FPU restores slightly If we have an FPU, there's no need to check CR0 for FPU emulation. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/980004297e233c27066d54e71382c44cdd36ef7c.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 87f80febf477..36a9c017540e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -752,7 +752,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION - if (read_cr0() & X86_CR0_EM) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { struct math_emu_info info = { }; conditional_sti(regs); -- cgit From 58122bf1d856a4ea9581d62a07c557d997d46a19 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:10 -0800 Subject: x86/fpu: Default eagerfpu=on on all CPUs We have eager and lazy FPU modes, introduced in: 304bceda6a18 ("x86, fpu: use non-lazy fpu restore for processors supporting xsave") The result is rather messy. There are two code paths in almost all of the FPU code, and only one of them (the eager case) is tested frequently, since most kernel developers have new enough hardware that we use eagerfpu. It seems that, on any remotely recent hardware, eagerfpu is a win: glibc uses SSE2, so laziness is probably overoptimistic, and, in any case, manipulating TS is far slower that saving and restoring the full state. (Stores to CR0.TS are serializing and are poorly optimized.) To try to shake out any latent issues on old hardware, this changes the default to eager on all CPUs. If no performance or functionality problems show up, a subsequent patch could remove lazy mode entirely. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/ac290de61bf08d9cfc2664a4f5080257ffc1075a.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7ef4c8..471fe277ff40 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -260,7 +260,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * not only saved the restores along the way, but we also have the * FPU ready to be used for the original task. * - * 'eager' switching is used on modern CPUs, there we switch the FPU + * 'lazy' is deprecated because it's almost never a performance win + * and it's much more complicated than 'eager'. + * + * 'eager' switching is by default on all CPUs, there we switch the FPU * state during every context switch, regardless of whether the task * has used FPU instructions in that time slice or not. This is done * because modern FPU context saving instructions are able to optimize @@ -271,7 +274,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * to use 'eager' restores, if we detect that a task is using the FPU * frequently. See the fpu->counter logic in fpu/internal.h for that. 
] */ -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static enum { ENABLE, DISABLE } eagerfpu = ENABLE; /* * Find supported xfeatures based on cpu features and command-line input. @@ -348,15 +351,9 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - /* - * No need to check "eagerfpu=auto" again, since it is the - * initial default. - */ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { eagerfpu = DISABLE; fpu__clear_eager_fpu_features(); - } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { - eagerfpu = ENABLE; } if (cmdline_find_option_bool(boot_command_line, "no387")) -- cgit From f8543d6a977a1bdb37eb13ad81ef2874526209b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Jan 2016 13:42:24 +0100 Subject: KVM: APIC: remove unnecessary double checks on APIC existence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usually the in-kernel APIC's existence is checked in the caller. Do not bother checking it again in lapic.c. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1520d1acd0ad..b1029051f664 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -475,18 +475,12 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - int highest_irr; - /* This may race with setting of irr in __apic_accept_irq() and * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!kvm_vcpu_has_lapic(vcpu)) - return 0; - highest_irr = apic_find_highest_irr(vcpu->arch.apic); - - return highest_irr; + return apic_find_highest_irr(vcpu->arch.apic); } static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, @@ -1601,8 +1595,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_has_lapic(vcpu)) - apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); + apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1676,9 +1669,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) - return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); } @@ -1687,9 +1677,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - if (!kvm_vcpu_has_lapic(vcpu)) - return 0; - tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; @@ -1912,7 +1899,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) + if (!apic_enabled(apic)) return -1; apic_update_ppr(apic); -- cgit From 1e3161b4147caf2045ac4aae3d71fae6ac1a1d65 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Jan 2016 13:41:16 +0100 Subject: KVM: x86: consolidate "has lapic" checks into irq.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do for kvm_cpu_has_pending_timer and kvm_inject_pending_timer_irqs what the other irq.c routines have been doing. 
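Schematically, the contract after this consolidation (a sketch of the pattern, mirroring the hunks below rather than quoting them): the lapic.c helpers may assume an in-kernel LAPIC, and the irq.c boundary routine does the single guarded dispatch:

/* Boundary routine: the only place that checks for an in-kernel LAPIC. */
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
	if (!lapic_in_kernel(vcpu))
		return 0;

	return apic_has_pending_timer(vcpu);
}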
Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.c | 9 ++++++--- arch/x86/kvm/lapic.c | 6 +----- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3982b479bb5f..95fcc7b13866 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -33,7 +33,10 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return apic_has_pending_timer(vcpu); + if (lapic_in_kernel(vcpu)) + return apic_has_pending_timer(vcpu); + + return 0; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { - kvm_inject_apic_timer_irqs(vcpu); - /* TODO: PIT, RTC etc. */ + if (lapic_in_kernel(vcpu)) + kvm_inject_apic_timer_irqs(vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b1029051f664..57e3f27bdadb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1801,8 +1801,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && - apic_lvt_enabled(apic, APIC_LVTT)) + if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); return 0; @@ -1927,9 +1926,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) - return; - if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) -- cgit From bce87cce88c71957c56479809db8316a836ec8b1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Jan 2016 13:48:51 +0100 Subject: KVM: x86: consolidate different ways to test for in-kernel LAPIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Different pieces of code checked for vcpu->arch.apic being (non-)NULL, or used kvm_vcpu_has_lapic (more optimized) or lapic_in_kernel. Replace everything with lapic_in_kernel's name and kvm_vcpu_has_lapic's implementation. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.h | 8 -------- arch/x86/kvm/lapic.c | 16 ++++++++-------- arch/x86/kvm/lapic.h | 8 ++++---- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/x86.c | 17 +++++++++-------- 5 files changed, 22 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ae5c78f2337d..61ebdc13a29a 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return ret; } -static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) -{ - /* Same as irqchip_in_kernel(vcpu->kvm), but with less - * pointer chasing and no unnecessary memory barriers. 
- */ - return vcpu->arch.apic != NULL; -} - void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 57e3f27bdadb..1482a581a83c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -1319,7 +1319,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; if (apic->lapic_timer.expired_tscdeadline == 0) @@ -1645,7 +1645,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return 0; @@ -1656,7 +1656,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; @@ -2001,7 +2001,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; @@ -2174,7 +2174,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -2188,7 +2188,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) @@ -2220,7 +2220,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) u8 sipi_vector; unsigned long pe; - if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) + if (!lapic_in_kernel(vcpu) || !apic->pending_events) return; /* diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index afccf4099b00..59610099af04 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -103,7 +103,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) extern struct static_key kvm_no_apic_vcpu; -static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) +static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { if (static_key_false(&kvm_no_apic_vcpu)) return vcpu->arch.apic; @@ -130,7 +130,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); + return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); } static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) @@ -150,7 +150,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events; } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) @@ -161,7 +161,7 @@ static inline 
bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } static inline int kvm_apic_id(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 31aa2c85dc97..06ce377dcbc9 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apic) + if (lapic_in_kernel(vcpu)) kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aafbcf9f9776..ee3e990d519a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2984,7 +2984,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && - kvm_vcpu_has_lapic(vcpu)) + lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { @@ -2997,7 +2997,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (kvm_vcpu_has_lapic(vcpu)) { + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); else @@ -3237,7 +3237,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); @@ -3255,7 +3255,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); if (IS_ERR(u.lapic)) @@ -4090,7 +4090,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; @@ -4110,7 +4110,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) @@ -6007,7 +6007,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) return; if (vcpu->arch.apicv_active) @@ -7035,7 +7035,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (!kvm_vcpu_has_lapic(vcpu) && + if (!lapic_in_kernel(vcpu) && mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; @@ -7590,6 +7590,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) } struct static_key kvm_no_apic_vcpu __read_mostly; +EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { -- cgit From 50d35015ff0c00a464e35b109231145d2beec1bd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Feb 2016 15:04:40 +0000 Subject: KEYS: 
CONFIG_KEYS_DEBUG_PROC_KEYS is no longer an option CONFIG_KEYS_DEBUG_PROC_KEYS is no longer an option as /proc/keys is now mandatory if the keyrings facility is enabled (it's used by libkeyutils in userspace). The defconfig references were removed with: perl -p -i -e 's/CONFIG_KEYS_DEBUG_PROC_KEYS=y\n//' \ `git grep -l CONFIG_KEYS_DEBUG_PROC_KEYS=y` and the integrity Kconfig fixed by hand. Signed-off-by: David Howells cc: Andreas Ziegler cc: Dmitry Kasatkin --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48c8839..a212b68a4909 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -303,7 +303,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cb5b3ab5beec..4f404a64681b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -300,7 +300,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y -CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y -- cgit From e2c7698cd61f11d4077fdb28148b2d31b82ac848 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 15:51:16 +0100 Subject: x86/mm: Fix INVPCID asm constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we want to specify the dependency on both @pcid and @addr so that the compiler doesn't reorder accesses to them *before* the TLB flush. But for that to work, we need to express this properly in the inline asm and deref the whole desc array, not the pointer to it. See clwb() for an example. This fixes the build error on 32-bit: arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’: arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable which gcc4.7 caught but 5.x didn't. Which is strange. :-\ Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michael Matz Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fc9a2fda1404..d0cce90b0855 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,7 +10,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { - u64 desc[2] = { pcid, addr }; + struct { u64 d[2]; } desc = { { pcid, addr } }; /* * The memory clobber is because the whole point is to invalidate @@ -22,7 +22,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * invpcid (%rcx), %rax in long mode. 
*/ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (desc) : "memory"); + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit From 2cfec6a2f989d5c921ba11a329ff8ea986702b9b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 11 Feb 2016 16:10:25 -0500 Subject: xen/pcifront: Report the errors better. The messages should be different depending on the type of error. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/pci.h | 4 ++-- arch/x86/pci/xen.c | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 968d57dd54c9..f320ee32d5a1 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -57,7 +57,7 @@ static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, { if (xen_pci_frontend && xen_pci_frontend->enable_msi) return xen_pci_frontend->enable_msi(dev, vectors); - return -ENODEV; + return -ENOSYS; } static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) { @@ -69,7 +69,7 @@ static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, { if (xen_pci_frontend && xen_pci_frontend->enable_msix) return xen_pci_frontend->enable_msix(dev, vectors, nvec); - return -ENODEV; + return -ENOSYS; } static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev) { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index ff31ab464213..beac4dfdade6 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -196,7 +196,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + if (ret == -ENOSYS) + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + else if (ret) + dev_err(&dev->dev, "Xen PCI frontend error: %d!\n", ret); free: kfree(v); return ret; -- cgit From ed07247dbf5b0a23b0289c9e8e4a2ceb9b8f2e9d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 2 Feb 2016 13:53:30 -0600 Subject: gpio: Remove unused asm/gpio.h files asm/gpio.h is included only by linux/gpio.h, and then only when the arch selects ARCH_HAVE_CUSTOM_GPIO_H. Only the following arches select it: arm avr32 blackfin m68k (COLDFIRE only) sh unicore32. Remove the unused asm/gpio.h files for the arches that do not select ARCH_HAVE_CUSTOM_GPIO_H. This is a follow-on to 7563bbf89d06 ("gpiolib/arches: Centralise bolierplate asm/gpio.h"). Signed-off-by: Bjorn Helgaas Acked-by: Thomas Gleixner Acked-by: Arnd Bergmann Acked-by: Alexandre Courbot Signed-off-by: Linus Walleij --- arch/x86/include/asm/gpio.h | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 arch/x86/include/asm/gpio.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h deleted file mode 100644 index b3799d88ffcf..000000000000 --- a/arch/x86/include/asm/gpio.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __LINUX_GPIO_H -#warning Include linux/gpio.h instead of asm/gpio.h -#include -#endif -- cgit From f2cc8e0791c70833758101d9756609a08dd601ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 00:19:18 +0100 Subject: x86/cpufeature: Speed up cpu_feature_enabled() When GCC cannot do constant folding for this macro, it falls back to cpu_has(). But static_cpu_has() is optimal and it works at all times now. So use it and speedup the fallback case. 
Before we had this: mov 0x99d674(%rip),%rdx # ffffffff81b0d9f4 shr $0x2e,%rdx and $0x1,%edx jne ffffffff811704e9 After alternatives patching, it turns into: jmp 0xffffffff81170390 nopl (%rax) ... callq ffffffff81056e00 ffffffff81170390: mov 0x170(%r12),%rdi Signed-off-by: Borislav Petkov Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455578358-28347-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9fba7a5dd24a..68e4e8258b84 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -88,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * is not relevant. */ #define cpu_feature_enabled(bit) \ - (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ - cpu_has(&boot_cpu_data, bit)) + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) -- cgit From 7f5301b7e66a1fd096b5d10dbb0bb2a8832516b4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:52 -0500 Subject: x86/platform: Make platform/intel-quark/imr.c explicitly non-modular The Kconfig currently controlling compilation of this code is: drivers/platform/x86/Kconfig:config INTEL_IMR drivers/platform/x86/Kconfig: bool "Intel Isolated Memory Region support" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Reviewed-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-2-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index c61b6c332e97..0a3736f03edc 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ /** - * imr.c + * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -31,7 +31,6 @@ #include #include #include -#include #include struct imr_device { @@ -269,17 +268,6 @@ static int imr_debugfs_register(struct imr_device *idev) return PTR_ERR_OR_ZERO(idev->file); } -/** - * imr_debugfs_unregister - unregister debugfs hooks. - * - * @idev: pointer to imr_device structure. - * @return: - */ -static void imr_debugfs_unregister(struct imr_device *idev) -{ - debugfs_remove(idev->file); -} - /** * imr_check_params - check passed address range IMR alignment and non-zero size * @@ -614,7 +602,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. 
*/ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_init - entry point for IMR driver. @@ -640,22 +627,4 @@ static int __init imr_init(void) imr_fixup_memmap(idev); return 0; } - -/** - * imr_exit - exit point for IMR code. - * - * Deregisters debugfs, leave IMR state as-is. - * - * return: - */ -static void __exit imr_exit(void) -{ - imr_debugfs_unregister(&imr_dev); -} - -module_init(imr_init); -module_exit(imr_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_init); -- cgit From 32ed42ad6c42d6e0e78914a25c216fbbd595e74d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:53 -0500 Subject: x86/platform: Make platform/intel-quark/imr_selftest.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig.debug:config DEBUG_IMR_SELFTEST arch/x86/Kconfig.debug: bool "Isolated Memory Region self test" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Reviewed-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-3-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr_selftest.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 278e4da4222f..0381343a0d3a 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ /** - * imr_selftest.c + * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -15,7 +15,6 @@ #include #include #include -#include #include #define SELFTEST KBUILD_MODNAME ": " @@ -106,7 +105,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_self_test_init - entry point for IMR driver. @@ -125,13 +123,4 @@ static int __init imr_self_test_init(void) * * return: */ -static void __exit imr_self_test_exit(void) -{ -} - -module_init(imr_self_test_init); -module_exit(imr_self_test_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_self_test_init); -- cgit From eb61aee743ec647f22071479a5137241a7dcab17 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:54 -0500 Subject: x86/platform: Make platform/geode/geos.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config GEOS arch/x86/Kconfig: bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)" ...meaning that it currently is not being built as a module by anyone. 
Lets remove the couple traces of modularity, so that when reading the code there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Philip Prindeville Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-4-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/geos.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index aa733fba2471..4fcdb91318a0 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -120,9 +119,4 @@ static int __init geos_init(void) return 0; } - -module_init(geos_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(geos_init); -- cgit From 52d856e88171e1ec000d0479363c1e4e81991130 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:55 -0500 Subject: x86/platform: Make platform/geode/alix.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config ALIX arch/x86/Kconfig: bool "PCEngines ALIX System Support (LED setup)" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We replace module.h with moduleparam.h since the file does declare some module parameters, and leaving them as such is currently the easiest way to remain compatible with existing boot arg use cases. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Ed Wildgoose Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-5-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/alix.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 76b6632d3143..1865c196f136 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,6 +35,11 @@ #define BIOS_SIGNATURE_COREBOOT 0x500 #define BIOS_REGION_SIZE 0x10000 +/* + * This driver is not modular, but to keep back compatibility + * with existing use cases, continuing with module_param is + * the easiest way forward. 
+ */ static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ @@ -192,9 +197,4 @@ static int __init alix_init(void) return 0; } - -module_init(alix_init); - -MODULE_AUTHOR("Ed Wildgoose "); -MODULE_DESCRIPTION("PCEngines ALIX System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(alix_init); -- cgit From 605a46ee8353e8292e93baa5dc13e6be98bbec43 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:56 -0500 Subject: x86/platform: Make platform/geode/net5501.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config NET5501 arch/x86/Kconfig: bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modularity, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Philip Prindeville Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-6-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/net5501.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 927e38c0089f..a2f6b982a729 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -146,9 +145,4 @@ static int __init net5501_init(void) return 0; } - -module_init(net5501_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Soekris net5501 System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(net5501_init); -- cgit From 25b4caf7c50e8c501310e8c515d8518b1850c948 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 14 Feb 2016 13:35:58 +0100 Subject: x86/boot: Remove unused 'is_big_kernel' variable Variable is_big_kernel is defined in arch/x86/boot/tools/build.c but never used anywhere. Boris noted that its usage went away 7 years ago, as of: 5e47c478b0b6 ("x86: remove zImage support") Signed-off-by: Nicolas Iooss Reviewed-by: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455453358-4088-1-git-send-email-nicolas.iooss_linux@m4x.org Signed-off-by: Ingo Molnar --- arch/x86/boot/tools/build.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a7661c430cd9..0702d2531bc7 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -49,7 +49,6 @@ typedef unsigned int u32; /* This must be large enough to hold the entire setup */ u8 buf[SETUP_SECT_MAX*512]; -int is_big_kernel; #define PECOFF_RELOC_RESERVE 0x20 -- cgit From d4edcf0d56958db0aca0196314ca38a5e730ea92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:56 -0800 Subject: mm/gup: Switch all callers of get_user_pages() to not pass tsk/mm We will soon modify the vanilla get_user_pages() so it can no longer be used on mm/tasks other than 'current/current->mm', which is by far the most common way it is called. For now, we allow the old-style calls, but warn when they are used. (implemented in previous patch) This patch switches all callers of: get_user_pages() get_user_pages_unlocked() get_user_pages_locked() to stop passing tsk/mm so they will no longer see the warnings. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210156.113E9407@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 2 +- arch/x86/mm/mpx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6d5eb5900372..ce5e4545203b 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -422,7 +422,7 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, + ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, write, 0, pages); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b2fd67da1701..84fa4a482c78 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write) int nr_pages = 1; int force = 0; - gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, - nr_pages, write, force, NULL, NULL); + gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, + force, NULL, NULL); /* * get_user_pages() returns number of pages gotten. * 0 means we failed to fault in and get anything, -- cgit From 1f96b1efbad4bb753e7fd265753f6cac1cdc5648 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:58 -0800 Subject: x86/fpu: Add placeholder for 'Processor Trace' XSAVE state There is an XSAVE state component for Intel Processor Trace (PT). But, we do not currently use it. We add a placeholder in the code for it so it is not a mystery and also so we do not need an explicit enum initialization for Protection Keys in a moment. Why don't we use it? We might end up using this at _some_ point in the future. But, this is a "system" state which requires using the currently unsupported XSAVES feature. Unlike all the other XSAVE states, PT state is also not directly tied to a thread. 
You might context-switch between threads, but not want to change any of the PT state. Or, you might switch between threads, and *do* want to change PT state, all depending on what is being traced. We currently just manually set some MSRs to do this PT context switching, and it is unclear whether replacing our direct MSR use with XSAVE will be a net win or loss, both in code complexity and performance. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: fenghua.yu@intel.com Cc: linux-mm@kvack.org Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20160212210158.5E4BCAE2@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 1 + arch/x86/kernel/fpu/xstate.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c6f6ac52ad0..aad3181c9b7b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -108,6 +108,7 @@ enum xfeature { XFEATURE_OPMASK, XFEATURE_ZMM_Hi256, XFEATURE_Hi16_ZMM, + XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_MAX, }; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d425cda5ae6d..c2e2349dbcfe 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -13,6 +13,11 @@ #include +/* + * Although we spell it out in here, the Processor Trace + * xfeature is completely unused. We use other mechanisms + * to save/restore PT state in Linux. + */ static const char *xfeature_names[] = { "x87 floating point registers" , @@ -23,7 +28,7 @@ static const char *xfeature_names[] = "AVX-512 opmask" , "AVX-512 Hi256" , "AVX-512 ZMM_Hi256" , - "unknown xstate feature" , + "Processor Trace (unused)" , }; /* @@ -470,7 +475,8 @@ static void check_xstate_against_struct(int nr) * numbers. */ if ((nr < XFEATURE_YMM) || - (nr >= XFEATURE_MAX)) { + (nr >= XFEATURE_MAX) || + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } -- cgit From 35e97790f5f1e5cf2b5522c55e3e31d5c81bd226 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:00 -0800 Subject: x86/mm/pkeys: Add Kconfig option I don't have a strong opinion on whether we need a Kconfig prompt or not. Protection Keys has relatively little code associated with it, and it is not a heavyweight feature to keep enabled. However, I can imagine that folks would still appreciate being able to disable it. Note that, with disabled-features.h, the checks in the code for protection keys are always the same: cpu_has(c, X86_FEATURE_PKU) With the config option disabled, this essentially turns into an We will hide the prompt for now. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210200.DB7055E8@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab2ed5328f0a..3632cdd03201 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1714,6 +1714,10 @@ config X86_INTEL_MPX If unsure, say N. 
+config X86_INTEL_MEMORY_PROTECTION_KEYS + def_bool y + depends on CPU_SUP_INTEL && X86_64 + config EFI bool "EFI runtime service support" depends on ACPI -- cgit From dfb4a70f20c5b3880da56ee4c9484bdb4e8f1e65 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:01 -0800 Subject: x86/cpufeature, x86/mm/pkeys: Add protection keys related CPUID definitions There are two CPUID bits for protection keys. One is for whether the CPU contains the feature, and the other will appear set once the OS enables protection keys. Specifically: Bit 04: OSPKE. If 1, OS has set CR4.PKE to enable Protection keys (and the RDPKRU/WRPKRU instructions) This is because userspace can not see CR4 contents, but it can see CPUID contents. X86_FEATURE_PKU is referred to as "PKU" in the hardware documentation: CPUID.(EAX=07H,ECX=0H):ECX.PKU [bit 3] X86_FEATURE_OSPKE is "OSPKU": CPUID.(EAX=07H,ECX=0H):ECX.OSPKE [bit 4] These are the first CPU features which need to look at the ECX word in CPUID leaf 0x7, so this patch also includes fetching that word in to the cpuinfo->x86_capability[] array. Add it to the disabled-features mask when its config option is off. Even though we are not using it here, we also extend the REQUIRED_MASK_BIT_SET() macro to keep it mirroring the DISABLED_MASK_BIT_SET() version. This means that in almost all code, you should use: cpu_has(c, X86_FEATURE_PKU) and *not* the CONFIG option. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210201.7714C250@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 59 +++++++++++++++++++++----------- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/disabled-features.h | 15 ++++++++ arch/x86/include/asm/required-features.h | 7 ++++ arch/x86/kernel/cpu/common.c | 1 + 5 files changed, 63 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 68e4e8258b84..50e292a73a7b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -26,6 +26,7 @@ enum cpuid_leafs CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, + CPUID_7_ECX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -48,28 +49,42 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; test_bit(bit, (unsigned long *)((c)->x86_capability)) #define REQUIRED_MASK_BIT_SET(bit) \ - ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ - (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ - (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ - (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \ - (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ - (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ - (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ - (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) + ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0 )) || \ + (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1 )) || \ + (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2 )) || \ + (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3 )) || \ + (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4 )) || \ + 
(((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5 )) || \ + (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6 )) || \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7 )) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8 )) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9 )) || \ + (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) || \ + (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \ + (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \ + (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \ + (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \ + (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \ + (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) ) #define DISABLED_MASK_BIT_SET(bit) \ - ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) || \ - (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) || \ - (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) || \ - (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) || \ - (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) || \ - (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) || \ - (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) || \ - (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) ) + ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \ + (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1 )) || \ + (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2 )) || \ + (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3 )) || \ + (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4 )) || \ + (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5 )) || \ + (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6 )) || \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7 )) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8 )) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9 )) || \ + (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) || \ + (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \ + (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \ + (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \ + (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \ + (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \ + (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) ) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ @@ -79,6 +94,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + /* * This macro is for detection of features which need kernel * infrastructure to be used. 
It may *not* directly test the CPU diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0ceb6adc8a48..cbb2c568a19e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -12,7 +12,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 16 /* N 32-bit words worth of info */ +#define NCAPINTS 17 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index f226df064660..39343be7d4f4 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -28,6 +28,14 @@ # define DISABLE_CENTAUR_MCR 0 #endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +# define DISABLE_PKU (1<<(X86_FEATURE_PKU)) +# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE)) +#else +# define DISABLE_PKU 0 +# define DISABLE_OSPKE 0 +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ + /* * Make sure to add features to the correct mask */ @@ -41,5 +49,12 @@ #define DISABLED_MASK7 0 #define DISABLED_MASK8 0 #define DISABLED_MASK9 (DISABLE_MPX) +#define DISABLED_MASK10 0 +#define DISABLED_MASK11 0 +#define DISABLED_MASK12 0 +#define DISABLED_MASK13 0 +#define DISABLED_MASK14 0 +#define DISABLED_MASK15 0 +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 5c6e4fb370f5..4916144e3c42 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -92,5 +92,12 @@ #define REQUIRED_MASK7 0 #define REQUIRED_MASK8 0 #define REQUIRED_MASK9 0 +#define REQUIRED_MASK10 0 +#define REQUIRED_MASK11 0 +#define REQUIRED_MASK12 0 +#define REQUIRED_MASK13 0 +#define REQUIRED_MASK14 0 +#define REQUIRED_MASK15 0 +#define REQUIRED_MASK16 0 #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f4bb2c420969..a719ad7551d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -627,6 +627,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + c->x86_capability[CPUID_7_ECX] = ecx; } /* Extended state features: level 0x0000000d */ -- cgit From f28b49d2bcdb9ef9e771b3d6750f40be9d453316 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:02 -0800 Subject: x86/cpu, x86/mm/pkeys: Define new CR4 bit There is a new bit in CR4 for enabling protection keys. We will actually enable it later in the series. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210202.3CFC3DB2@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/processor-flags.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 79887abcb5e1..567de50a4c2a 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -118,6 +118,8 @@ #define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT) #define X86_CR4_SMAP_BIT 21 /* enable SMAP support */ #define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT) +#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */ +#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) /* * x86-64 Task Priority Register, CR8 -- cgit From c8df40098451ba18a43f22b563c9129182353158 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:04 -0800 Subject: x86/fpu, x86/mm/pkeys: Add PKRU xsave fields and data structures The protection keys register (PKRU) is saved and restored using xsave. Define the data structure that we will use to access it inside the xsave buffer. Note that we also have to widen the printk of the xsave feature masks since this is feature 0x200 and we only did two characters before. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210204.56DF8F7B@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 11 +++++++++++ arch/x86/include/asm/fpu/xstate.h | 3 ++- arch/x86/kernel/fpu/xstate.c | 7 ++++++- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index aad3181c9b7b..36b90bbfc69f 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -109,6 +109,7 @@ enum xfeature { XFEATURE_ZMM_Hi256, XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, + XFEATURE_PKRU, XFEATURE_MAX, }; @@ -121,6 +122,7 @@ enum xfeature { #define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) #define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) +#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ @@ -213,6 +215,15 @@ struct avx_512_hi16_state { struct reg_512_bit hi16_zmm[16]; } __packed; +/* + * State component 9: 32-bit PKRU register. The state is + * 8 bytes long but only 4 bytes is used currently. 
+ */ +struct pkru_state { + u32 pkru; + u32 pad; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index af30fdeb140d..9994d4229002 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -28,7 +28,8 @@ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM) + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU) /* All currently supported features */ #define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c2e2349dbcfe..a63ca80fa723 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -29,6 +29,8 @@ static const char *xfeature_names[] = "AVX-512 Hi256" , "AVX-512 ZMM_Hi256" , "Processor Trace (unused)" , + "Protection Keys User registers", + "unknown xstate feature" , }; /* @@ -58,6 +60,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512CD); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_PKU); } /* @@ -236,7 +239,7 @@ static void __init print_xstate_feature(u64 xstate_mask) const char *feature_name; if (cpu_has_xfeatures(xstate_mask, &feature_name)) - pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); } /* @@ -252,6 +255,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_OPMASK); print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); + print_xstate_feature(XFEATURE_MASK_PKRU); } /* @@ -468,6 +472,7 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); + XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); /* * Make *SURE* to add any feature numbers in below if -- cgit From 5bb16016ce111f3f4bc68bb109f7e5be6bc14ad7 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Feb 2016 20:14:21 +0100 Subject: KVM: VMX: Factor out is_exception_n helper There is quite some common code in all these is_() helpers. Factor it out before adding even more of them. 
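For reference, the single masked compare in is_exception_n() relies on the layout of the VM-exit interruption-information word. The standalone C sketch below is illustrative only and is not part of the patch; the field layout (vector in bits 7:0, event type in bits 10:8 with 3 meaning hardware exception, valid flag in bit 31) follows the VMX encoding, and the mask names here are made up rather than copied from the kernel headers.

#include <stdbool.h>
#include <stdint.h>

#define VECTOR_MASK    0x000000ffu   /* bits 7:0: exception/interrupt vector */
#define TYPE_MASK      0x00000700u   /* bits 10:8: event type */
#define TYPE_HARD_EXC  (3u << 8)     /* type 3: hardware exception */
#define VALID_MASK     0x80000000u   /* bit 31: information is valid */

/* Test for "valid hardware exception with this vector" in one compare. */
static bool is_exception_n_sketch(uint32_t intr_info, uint8_t vector)
{
        return (intr_info & (TYPE_MASK | VECTOR_MASK | VALID_MASK)) ==
               (TYPE_HARD_EXC | vector | VALID_MASK);
}

With that encoding, is_page_fault(), is_no_device() and is_invalid_opcode() in the diff below collapse into one-line wrappers passing PF_VECTOR, NM_VECTOR and UD_VECTOR respectively.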
Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 164eb9e1678b..7852092b82ae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -961,25 +961,26 @@ static const u32 vmx_msr_index[] = { MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -static inline bool is_page_fault(u32 intr_info) +static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); +} + +static inline bool is_page_fault(u32 intr_info) +{ + return is_exception_n(intr_info, PF_VECTOR); } static inline bool is_no_device(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, NM_VECTOR); } static inline bool is_invalid_opcode(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, UD_VECTOR); } static inline bool is_external_interrupt(u32 intr_info) -- cgit From 6f05485d3a161caebc0fc7a73d641fa42a0dd263 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 9 Feb 2016 20:15:18 +0100 Subject: KVM: VMX: Fix guest debugging while in L2 When we take a #DB or #BP vmexit while in guest mode, we first of all need to check if there is ongoing guest debugging that might be interested in the event. Currently, we unconditionally leave L2 and inject the event into L1 if it is intercepting the exceptions. That breaks things marvelously. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7852092b82ae..cb501d306416 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -968,6 +968,16 @@ static inline bool is_exception_n(u32 intr_info, u8 vector) (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); } +static inline bool is_debug(u32 intr_info) +{ + return is_exception_n(intr_info, DB_VECTOR); +} + +static inline bool is_breakpoint(u32 intr_info) +{ + return is_exception_n(intr_info, BP_VECTOR); +} + static inline bool is_page_fault(u32 intr_info) { return is_exception_n(intr_info, PF_VECTOR); @@ -7753,6 +7763,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; + else if (is_breakpoint(intr_info) && + vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: -- cgit From 4941b8cb3746f09bb102f7a5d64d878e96a0c6cd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Feb 2016 14:51:12 +0100 Subject: KVM: x86: rename argument to kvm_set_tsc_khz This refers to the desired (scaled) frequency, which is called user_tsc_khz in the rest of the file. 
Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ee3e990d519a..16bbe6df679b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1290,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return 0; } -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) { + if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return -1; } /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + kvm_get_time_scale(user_tsc_khz, NSEC_PER_SEC / 1000, &vcpu->arch.virtual_tsc_shift, &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = this_tsc_khz; + vcpu->arch.virtual_tsc_khz = user_tsc_khz; /* * Compute the variation in TSC rate which is acceptable @@ -1316,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) */ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) -- cgit From 78db6a5037965429c04d708281f35a6e5562d31b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Feb 2016 14:51:40 +0100 Subject: KVM: x86: rewrite handling of scaled TSC for kvmclock This is the same as before: kvm_scale_tsc(tgt_tsc_khz) = tgt_tsc_khz * ratio = tgt_tsc_khz * user_tsc_khz / tsc_khz (see set_tsc_khz) = user_tsc_khz (see kvm_guest_time_update) = vcpu->arch.virtual_tsc_khz (see kvm_set_tsc_khz) However, computing it through kvm_scale_tsc will make it possible to include the NTP correction in tgt_tsc_khz. 
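A quick worked example of the chain of equalities above, with purely illustrative numbers: if the host TSC runs at tsc_khz = 3,000,000 and the guest was configured with user_tsc_khz = 2,100,000, then set_tsc_khz() programs a scaling ratio of 2,100,000 / 3,000,000 = 0.7, and kvm_guest_time_update() now computes kvm_scale_tsc(3,000,000) = 3,000,000 * 0.7 = 2,100,000, which is exactly vcpu->arch.virtual_tsc_khz as before. The benefit comes later: once the NTP-corrected host frequency is passed in as tgt_tsc_khz, the same call yields the correspondingly corrected guest frequency instead of a fixed virtual_tsc_khz.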
Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16bbe6df679b..01d22b37556c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1713,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz, tgt_tsc_khz; + unsigned long flags, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1739,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { + tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; @@ -1775,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (!vcpu->pv_time_enabled) return 0; - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - tgt_tsc_khz = kvm_has_tsc_control ? - vcpu->virtual_tsc_khz : this_tsc_khz; + if (kvm_has_tsc_control) + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + + if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = this_tsc_khz; + vcpu->hw_tsc_khz = tgt_tsc_khz; } /* With all the info we got, fill in the values */ -- cgit From 4efd805fca5590af181f89ad122b8695c63b5f2c Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 12 Feb 2016 15:00:15 +0100 Subject: KVM: x86: fix *NULL on invalid low-prio irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smatch noticed a NULL dereference in kvm_intr_is_single_vcpu_fast that happens if VM already warned about invalid lowest-priority interrupt. Create a function for common code while fixing it. Fixes: 6228a0da8057 ("KVM: x86: Add lowest-priority support for vt-d posted-interrupts") Reported-by: Dan Carpenter Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1482a581a83c..cf74404230ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -685,6 +685,15 @@ int kvm_vector_to_index(u32 vector, u32 dest_vcpus, return idx; } +static void kvm_apic_disabled_lapic_found(struct kvm *kvm) +{ + if (!kvm->arch.disabled_lapic_found) { + kvm->arch.disabled_lapic_found = true; + printk(KERN_INFO + "Disabled LAPIC found during irq injection\n"); + } +} + bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) { @@ -763,15 +772,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, idx = kvm_vector_to_index(irq->vector, dest_vcpus, &bitmap, 16); - /* - * We may find a hardware disabled LAPIC here, if that - * is the case, print out a error message once for each - * guest and return. 
- */ - if (!dst[idx] && !kvm->arch.disabled_lapic_found) { - kvm->arch.disabled_lapic_found = true; - printk(KERN_INFO - "Disabled LAPIC found during irq injection\n"); + if (!dst[idx]) { + kvm_apic_disabled_lapic_found(kvm); goto out; } @@ -859,16 +861,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, idx = kvm_vector_to_index(irq->vector, dest_vcpus, &bitmap, 16); - /* - * We may find a hardware disabled LAPIC here, if that - * is the case, print out a error message once for each - * guest and return - */ dst = map->logical_map[cid][idx]; - if (!dst && !kvm->arch.disabled_lapic_found) { - kvm->arch.disabled_lapic_found = true; - printk(KERN_INFO - "Disabled LAPIC found during irq injection\n"); + if (!dst) { + kvm_apic_disabled_lapic_found(kvm); goto out; } -- cgit From 4e422bdd2f849d98fffccbc3295c2f0996097fb3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 Feb 2016 17:50:23 +0100 Subject: KVM: x86: fix missed hardware breakpoints Sometimes when setting a breakpoint a process doesn't stop on it. This is because the debug registers are not loaded correctly on VCPU load. The following simple reproducer from Oleg Nesterov tries using debug registers in both the host and the guest, for example by running "./bp 0 1" on the host and "./bp 14 15" under QEMU. #include #include #include #include #include #include #include #include #include #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void set_bp(pid_t pid, void *addr) { unsigned long dr7; assert(write_dr(pid, 0, (long)addr) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); } void *get_rip(int pid) { return (void*)ptrace(PTRACE_PEEKUSER, pid, offsetof(struct user, regs.rip), 0); } void test(int nr) { void *bp_addr = &&label + nr, *bp_hit; int pid; printf("test bp %d\n", nr); assert(nr < 16); // see 16 asm nops below pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGSTOP); for (;;) { label: asm ( "nop; nop; nop; nop;" "nop; nop; nop; nop;" "nop; nop; nop; nop;" "nop; nop; nop; nop;" ); } } assert(pid == wait(NULL)); set_bp(pid, bp_addr); for (;;) { assert(ptrace(PTRACE_CONT, pid, 0, 0) == 0); assert(pid == wait(NULL)); bp_hit = get_rip(pid); if (bp_hit != bp_addr) fprintf(stderr, "ERR!! 
hit wrong bp %ld != %d\n", bp_hit - &&label, nr); } } int main(int argc, const char *argv[]) { while (--argc) { int nr = atoi(*++argv); if (!fork()) test(nr); } while (wait(NULL) > 0) ; return 0; } Cc: stable@vger.kernel.org Suggested-by: Nadadv Amit Reported-by: Andrey Wagin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01d22b37556c..94ef72dce299 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2750,6 +2750,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -- cgit From 8ed6d76781dd01451546d402a2136b6389861d81 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Thu, 11 Feb 2016 16:44:57 +0300 Subject: kvm/x86: Rename Hyper-V long spin wait hypercall Rename HV_X64_HV_NOTIFY_LONG_SPIN_WAIT by HVCALL_NOTIFY_LONG_SPIN_WAIT, so the name is more consistent with the other hypercalls. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan CC: Gleb Natapov CC: Paolo Bonzini CC: Joerg Roedel CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Roman Kagan CC: Denis V. Lunev CC: qemu-devel@nongnu.org [Change name, Andrey used HV_X64_HCALL_NOTIFY_LONG_SPIN_WAIT. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 2 +- arch/x86/kvm/hyperv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 7956412d09bd..5699e1c94693 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -226,7 +226,7 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. */ -#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 #define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c58ba67175ac..31222442ac57 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1084,7 +1084,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + case HVCALL_NOTIFY_LONG_SPIN_WAIT: kvm_vcpu_on_spin(vcpu); break; default: -- cgit From 18f098618aa031f4c8a907c550fcd6785280c977 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Thu, 11 Feb 2016 16:44:58 +0300 Subject: drivers/hv: Move VMBus hypercall codes into Hyper-V UAPI header VMBus hypercall codes inside Hyper-V UAPI header will be used by QEMU to implement VMBus host devices support. Signed-off-by: Andrey Smetanin Acked-by: K. Y. Srinivasan Reviewed-by: Roman Kagan CC: Gleb Natapov CC: Paolo Bonzini CC: Joerg Roedel CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Roman Kagan CC: Denis V. Lunev CC: qemu-devel@nongnu.org [Do not rename the constant at the same time as moving it, as that would cause semantic conflicts with the Hyper-V tree. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 5699e1c94693..9b1a91834ac8 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -227,6 +227,8 @@ /* Declare the various hypercall operations. */ #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d #define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 -- cgit From 0d9c055eaaf41bebb0e6b095fff447523121fad3 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Thu, 11 Feb 2016 16:44:59 +0300 Subject: kvm/x86: Pass return code of kvm_emulate_hypercall Pass the return code from kvm_emulate_hypercall on to the caller, in order to allow it to indicate to the userspace that the hypercall has to be handled there. Also adjust all the existing code paths to return 1 to make sure the hypercall isn't passed to the userspace without setting kvm_run appropriately. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan CC: Gleb Natapov CC: Paolo Bonzini CC: Joerg Roedel CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Roman Kagan CC: Denis V. Lunev CC: qemu-devel@nongnu.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/svm.c | 3 +-- arch/x86/kvm/vmx.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 31222442ac57..599b06733f02 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1055,7 +1055,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) */ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); - return 0; + return 1; } longmode = is_64_bit_mode(vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c13a64b7d789..95070386d599 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1858,8 +1858,7 @@ static int halt_interception(struct vcpu_svm *svm) static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - kvm_emulate_hypercall(&svm->vcpu); - return 1; + return kvm_emulate_hypercall(&svm->vcpu); } static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb501d306416..9f08037ef14b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5758,8 +5758,7 @@ static int handle_halt(struct kvm_vcpu *vcpu) static int handle_vmcall(struct kvm_vcpu *vcpu) { - kvm_emulate_hypercall(vcpu); - return 1; + return kvm_emulate_hypercall(vcpu); } static int handle_invd(struct kvm_vcpu *vcpu) -- cgit From b2fdc2570a6c4b1fe950c11a2e9ce949f5190765 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Thu, 11 Feb 2016 16:45:00 +0300 Subject: kvm/x86: Reject Hyper-V hypercall continuation Currently we do not support Hyper-V hypercall continuation so reject it. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan CC: Gleb Natapov CC: Paolo Bonzini CC: Joerg Roedel CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Roman Kagan CC: Denis V. 
Lunev CC: qemu-devel@nongnu.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 599b06733f02..e8af5978762b 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1083,6 +1083,12 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + /* Hypercall continuation is not supported yet */ + if (rep_cnt || rep_idx) { + res = HV_STATUS_INVALID_HYPERCALL_CODE; + goto set_result; + } + switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: kvm_vcpu_on_spin(vcpu); @@ -1092,6 +1098,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } +set_result: ret = res | (((u64)rep_done & 0xfff) << 32); if (longmode) { kvm_register_write(vcpu, VCPU_REGS_RAX, ret); -- cgit From 83326e43f27e9a8a501427a0060f8af519a39bb2 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Thu, 11 Feb 2016 16:45:01 +0300 Subject: kvm/x86: Hyper-V VMBus hypercall userspace exit The patch implements KVM_EXIT_HYPERV userspace exit functionality for Hyper-V VMBus hypercalls: HV_X64_HCALL_POST_MESSAGE, HV_X64_HCALL_SIGNAL_EVENT. Changes v3: * use vcpu->arch.complete_userspace_io to setup hypercall result Changes v2: * use KVM_EXIT_HYPERV for hypercalls Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan CC: Gleb Natapov CC: Paolo Bonzini CC: Joerg Roedel CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Roman Kagan CC: Denis V. Lunev CC: qemu-devel@nongnu.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e8af5978762b..5ff3485acb60 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm) return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; } +static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) +{ + bool longmode; + + longmode = is_64_bit_mode(vcpu); + if (longmode) + kvm_register_write(vcpu, VCPU_REGS_RAX, result); + else { + kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); + } +} + +static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); + return 1; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret; @@ -1093,6 +1114,16 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) case HVCALL_NOTIFY_LONG_SPIN_WAIT: kvm_vcpu_on_spin(vcpu); break; + case HVCALL_POST_MESSAGE: + case HVCALL_SIGNAL_EVENT: + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = param; + vcpu->run->hyperv.u.hcall.params[0] = ingpa; + vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->arch.complete_userspace_io = + kvm_hv_hypercall_complete_userspace; + return 0; default: res = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -1100,12 +1131,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) set_result: ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - + kvm_hv_hypercall_set_result(vcpu, ret); 
return 1; } -- cgit From 3ae13faac40011e51234989d938fb70f4f0150d0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Feb 2016 15:11:15 +0100 Subject: KVM: x86: pass kvm_get_time_scale arguments in hertz Prepare for improving the precision in the next patch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94ef72dce299..2fb92c0af803 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1203,7 +1203,7 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return dividend; } -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, +static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, s8 *pshift, u32 *pmultiplier) { uint64_t scaled64; @@ -1211,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, uint64_t tps64; uint32_t tps32; - tps64 = base_khz * 1000LL; - scaled64 = scaled_khz * 1000LL; + tps64 = base_hz; + scaled64 = scaled_hz; while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; @@ -1230,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", - __func__, base_khz, scaled_khz, shift, *pmultiplier); + pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", + __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1303,7 +1303,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) } /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(user_tsc_khz, NSEC_PER_SEC / 1000, + kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, &vcpu->arch.virtual_tsc_shift, &vcpu->arch.virtual_tsc_mult); vcpu->arch.virtual_tsc_khz = user_tsc_khz; @@ -1779,7 +1779,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, + kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; -- cgit From 28856a9e52c7cac712af6c143de04766617535dc Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Tue, 9 Feb 2016 15:37:47 +0100 Subject: crypto: xts - consolidate sanity check for keys The patch centralizes the XTS key check logic into the service function xts_check_key which is invoked from the different XTS implementations. With this, the XTS implementations in ARM, ARM64, PPC and S390 have now a sanity check for the XTS keys similar to the other arches. In addition, this service function received a check to ensure that the key != the tweak key which is mandated by FIPS 140-2 IG A.9. As the check is not present in the standards defining XTS, it is only enforced in FIPS mode of the kernel. 
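In plain terms, the consolidated helper enforces two properties of the supplied key material. The standalone C sketch below is only an illustration of those two checks as described above; it is not the kernel's xts_check_key(), and it leaves out the transform error-flag handling that the removed per-caller checks used to do.

#include <errno.h>
#include <stdbool.h>
#include <string.h>

typedef unsigned char u8;

static int xts_key_check_sketch(const u8 *key, unsigned int keylen,
                                bool fips_mode)
{
        /* The key is two keys of equal size concatenated: keylen must be even. */
        if (keylen % 2)
                return -EINVAL;

        /* FIPS 140-2 IG A.9: the crypt key and the tweak key must not match. */
        if (fips_mode && !memcmp(key, key + keylen / 2, keylen / 2))
                return -EINVAL;

        return 0;
}

Each per-cipher setkey() in the diff below then simply calls the shared check and returns on error before splitting the key into its crypt and tweak halves.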
Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 11 +++-------- arch/x86/crypto/camellia_glue.c | 10 +++------- arch/x86/crypto/cast6_avx_glue.c | 10 +++------- arch/x86/crypto/serpent_avx_glue.c | 11 +++-------- arch/x86/crypto/serpent_sse2_glue.c | 11 +++-------- arch/x86/crypto/twofish_glue_3way.c | 10 +++------- 6 files changed, 18 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 3633ad6145c5..064c7e2bd7c8 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -639,16 +639,11 @@ static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 5c8b6266a394..aa76cad9d262 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1503,13 +1503,9 @@ int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index fca459578c35..50e684768c55 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -329,13 +329,9 @@ static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 5dc37026c7ce..6f778d3daa22 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -332,16 +332,11 @@ int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 3643dd508f45..8943407e8917 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -309,16 +309,11 @@ static int 
xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 56d8a08ee479..2ebb5e9789f3 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -277,13 +277,9 @@ int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, u32 *flags = &tfm->crt_flags; int err; - /* key consists of keys of equal size concatenated, therefore - * the length must be even - */ - if (keylen % 2) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + err = xts_check_key(tfm, key, keylen); + if (err) + return err; /* first half of xts-key is for crypt */ err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); -- cgit From 92016ba5c1d71fbe4e9952df518b5386f2a0556b Mon Sep 17 00:00:00 2001 From: Jake Oshins Date: Tue, 16 Feb 2016 21:56:21 +0000 Subject: PCI: Add fwnode_handle to x86 pci_sysdata Add an fwnode_handle to the x86 struct pci_sysdata, which will be used to locate an IRQ domain associated with a root PCI bus. [bhelgaas: changelog] Signed-off-by: Jake Oshins Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/pci.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 462594320d39..6fc3c7c554b2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -20,6 +20,9 @@ struct pci_sysdata { #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ #endif +#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN + void *fwnode; /* IRQ domain for MSI assignment */ +#endif }; extern int pci_routeirq; @@ -32,6 +35,7 @@ extern int noioapicreroute; static inline int pci_domain_nr(struct pci_bus *bus) { struct pci_sysdata *sd = bus->sysdata; + return sd->domain; } @@ -41,6 +45,17 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif +#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + + return sd->fwnode; +} + +#define pci_root_bus_fwnode _pci_root_bus_fwnode +#endif + /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ -- cgit From 670e27d809a9a29943e1d2e45823fa4fc16c29f0 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 1 Dec 2015 01:36:39 -0500 Subject: x86 msr-index: Simplify syntax for HWP fields syntax only, no functional change Signed-off-by: Len Brown --- arch/x86/include/asm/msr-index.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 690b4027e17c..5c5e7e53d824 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -230,10 +230,10 @@ #define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) /* IA32_HWP_CAPABILITIES */ -#define HWP_HIGHEST_PERF(x) (x & 0xff) -#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) -#define 
HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) -#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) +#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff) +#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff) +#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff) +#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ #define HWP_MIN_PERF(x) (x & 0xff) -- cgit From e54fdcca70a33a7e447e526b305db85e978c0563 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:01 -0800 Subject: x86/signal/64: Add a comment about sigcontext->fs and gs These fields have a strange history. This tries to document it. This borrows from 9a036b93a344 ("x86/signal/64: Remove 'fs' and 'gs' from sigcontext"), which was reverted by ed596cde9425 ("Revert x86 sigcontext cleanups"). Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/baa78f3c84106fa5acbc319377b1850602f5deec.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d485232f1e9f..702c40468859 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -341,6 +341,31 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * These slots should never be reused without extreme caution: + * + * - Some DOSEMU versions stash fs and gs in these slots manually, + * thus overwriting anything the kernel expects to be preserved + * in these slots. + * + * - If these slots are ever needed for any other purpose, + * there is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there + * is no TLS API beyond modify_ldt that works in both pre- + * and post-2.5.64 kernels. + * + * If the kernel ever adds explicit fs, gs, fsbase, and gsbase + * save/restore, it will most likely need to be opt-in and use + * different context slots. + */ __u16 gs; __u16 fs; __u16 __pad0; -- cgit From 8ff5bd2e1e2767fbf737f84d5f92668dafe7e7b0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:02 -0800 Subject: x86/signal/64: Fix SS if needed when delivering a 64-bit signal Signals are always delivered to 64-bit tasks with CS set to a long mode segment. In long mode, SS doesn't matter as long as it's a present writable segment. If SS starts out invalid (this can happen if the signal was caused by an IRET fault or was delivered on the way out of set_thread_area or modify_ldt), then IRET to the signal handler can fail, eventually killing the task. The straightforward fix would be to simply reset SS when delivering a signal. That breaks DOSEMU, though: 64-bit builds of DOSEMU rely on SS being set to the faulting SS when signals are delivered. 
As a compromise, this patch leaves SS alone so long as it's valid. The net effect should be that the behavior of successfully delivered signals is unchanged. Some signals that would previously have failed to be delivered will now be delivered successfully. This has no effect for x32 or 32-bit tasks: their signal handlers were already called with SS == __USER_DS. (On Xen, there's a slight hole: if a task sets SS to a writable *kernel* data segment, then we will fail to identify it as invalid and we'll still kill the task. If anyone cares, this could be fixed with a new paravirt hook.) Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/163c6e1eacde41388f3ff4d2fe6769be651d7b6e.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc_defs.h | 23 ++++++++++++++++++ arch/x86/kernel/signal.c | 51 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 278441f39856..eb5deb42484d 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -98,4 +98,27 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Access rights as returned by LAR */ +#define AR_TYPE_RODATA (0 * (1 << 9)) +#define AR_TYPE_RWDATA (1 * (1 << 9)) +#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9)) +#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9)) +#define AR_TYPE_XOCODE (4 * (1 << 9)) +#define AR_TYPE_XRCODE (5 * (1 << 9)) +#define AR_TYPE_XOCODE_CONF (6 * (1 << 9)) +#define AR_TYPE_XRCODE_CONF (7 * (1 << 9)) +#define AR_TYPE_MASK (7 * (1 << 9)) + +#define AR_DPL0 (0 * (1 << 13)) +#define AR_DPL3 (3 * (1 << 13)) +#define AR_DPL_MASK (3 * (1 << 13)) + +#define AR_A (1 << 8) /* "Accessed" */ +#define AR_S (1 << 12) /* If clear, "System" segment */ +#define AR_P (1 << 15) /* "Present" */ +#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */ +#define AR_L (1 << 21) /* "Long mode" for code segments */ +#define AR_DB (1 << 22) /* D/B, effect depends on type */ +#define AR_G (1 << 23) /* "Granularity" (limit in pages) */ + #endif /* _ASM_X86_DESC_DEFS_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c07ff5ddbd47..52f82c7ef57d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,6 +61,35 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. 
+ */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +#endif + int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { unsigned long buf_val; @@ -459,10 +488,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatbility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ regs->cs = __USER_CS; + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + return 0; } #endif /* CONFIG_X86_32 */ -- cgit From 6c25da5ad55d48c41b8909bc1f4e3cd5d85bb499 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:03 -0800 Subject: x86/signal/64: Re-add support for SS in the 64-bit signal context This is a second attempt to make the improvements from c6f2062935c8 ("x86/signal/64: Fix SS handling for signals delivered to 64-bit programs"), which was reverted by 51adbfbba5c6 ("x86/signal/64: Add support for SS in the 64-bit signal context"). This adds two new uc_flags flags. UC_SIGCONTEXT_SS will be set for all 64-bit signals (including x32). It indicates that the saved SS field is valid and that the kernel supports the new behavior. The goal is to fix a problems with signal handling in 64-bit tasks: SS wasn't saved in the 64-bit signal context, making it awkward to determine what SS was at the time of signal delivery and making it impossible to return to a non-flat SS (as calling sigreturn clobbers SS). This also made it extremely difficult for 64-bit tasks to return to fully-defined 16-bit contexts, because only the kernel can easily do espfix64, but sigreturn was unable to set a non-flag SS:ESP. (DOSEMU has a monstrous hack to partially work around this limitation.) If we could go back in time, the correct fix would be to make 64-bit signals work just like 32-bit signals with respect to SS: save it in signal context, reset it when delivering a signal, and restore it in sigreturn. Unfortunately, doing that (as I tried originally) breaks DOSEMU: DOSEMU wouldn't reset the signal context's SS when clearing the LDT and changing the saved CS to 64-bit mode, since it predates the SS context field existing in the first place. This patch is a bit more complicated, and it tries to balance a bunch of goals. It makes most cases of changing ucontext->ss during signal handling work as expected. I do this by special-casing the interesting case. 
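[ Aside: the behavior described in this log can be observed from userspace. A hedged sketch follows: the flag values are copied from the uapi header change in this patch, the handler is purely illustrative, and printf() from a signal handler is not async-signal-safe (acceptable for a one-shot demo only). ]

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

#ifndef UC_SIGCONTEXT_SS
#define UC_SIGCONTEXT_SS        0x2     /* values from this patch */
#define UC_STRICT_RESTORE_SS    0x4
#endif

static void handler(int sig, siginfo_t *si, void *ctx)
{
        ucontext_t *uc = ctx;

        (void)sig; (void)si;

        if (uc->uc_flags & UC_SIGCONTEXT_SS)
                printf("SS saved in sigcontext; strict restore: %s\n",
                       (uc->uc_flags & UC_STRICT_RESTORE_SS) ? "yes" : "no");
        else
                printf("old kernel: SS not part of the 64-bit signal context\n");
}

int main(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);
        raise(SIGUSR1);
        return 0;
}

On a 64-bit task running a kernel with this patch the handler should report that SS was saved and that strict restore is in effect; on an older kernel the flags are simply absent.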
On sigreturn, ucontext->ss will be honored by default, unless the ucontext was created from scratch by an old program and had a 64-bit CS (unfortunately, CRIU can do this) or was the result of changing a 32-bit signal context to 64-bit without resetting SS (as DOSEMU does). For the benefit of new 64-bit software that uses segmentation (new versions of DOSEMU might), the new behavior can be detected with a new ucontext flag UC_SIGCONTEXT_SS. To avoid compilation issues, __pad0 is left as an alias for ss in ucontext. The nitty-gritty details are documented in the header file. This patch also re-enables the sigreturn_64 and ldt_gdt_64 selftests, as the kernel change allows both of them to pass. Tested-by: Stas Sergeev Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/749149cbfc3e75cd7fcdad69a854b399d792cc6f.1455664054.git.luto@kernel.org [ Small readability edit. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sighandling.h | 1 - arch/x86/include/uapi/asm/sigcontext.h | 7 ++-- arch/x86/include/uapi/asm/ucontext.h | 53 +++++++++++++++++++++++++--- arch/x86/kernel/signal.c | 63 ++++++++++++++++++++++++---------- 4 files changed, 97 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 89db46752a8f..452c88b8ad06 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,7 +13,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 702c40468859..62d4111c1c54 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -256,7 +256,7 @@ struct sigcontext_64 { __u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; @@ -368,7 +368,10 @@ struct sigcontext { */ __u16 gs; __u16 fs; - __u16 __pad0; + union { + __u16 ss; /* If UC_SIGCONTEXT_SS */ + __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */ + }; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index b7c29c8017f2..e3d1ec90616e 100644 --- a/arch/x86/include/uapi/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h @@ -1,11 +1,54 @@ #ifndef _ASM_X86_UCONTEXT_H #define _ASM_X86_UCONTEXT_H -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext). - */ +/* + * Indicates the presence of extended state information in the memory + * layout pointed by the fpstate pointer in the ucontext's sigcontext + * struct (uc_mcontext). + */ +#define UC_FP_XSTATE 0x1 + +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. 
they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + * + * This behavior serves three purposes: + * + * - Legacy programs that construct a 64-bit sigcontext from scratch + * with zero or garbage in the SS slot (e.g. old CRIU) and call + * sigreturn will still work. + * + * - Old DOSEMU versions sometimes catch a signal from a segmented + * context, delete the old SS segment (with modify_ldt), and change + * the saved CS to a 64-bit segment. These DOSEMU versions expect + * sigreturn to send them back to 64-bit mode without killing them, + * despite the fact that the SS selector when the signal was raised is + * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel + * will fix up SS for these DOSEMU versions. + * + * - Old and new programs that catch a signal and return without + * modifying the saved context will end up in exactly the state they + * started in, even if they were running in a segmented context when + * the signal was raised.. Old kernels would lose track of the + * previous SS value. + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 52f82c7ef57d..548ddf7d6fd2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -90,7 +90,9 @@ static void force_valid_ss(struct pt_regs *regs) } #endif -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc, + unsigned long uc_flags) { unsigned long buf_val; void __user *buf; @@ -123,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. 
+ */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && + user_64bit_mode(regs))) + force_valid_ss(regs); +#endif get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -194,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -432,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return 0; } #else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (cpu_has_xsave) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -451,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -536,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -601,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc)) + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. 
+ */ + if (restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -617,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -813,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -820,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) -- cgit From 84aba677f009e20185aea322563389ad56e0ef7e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 16 Feb 2016 09:43:19 +0100 Subject: x86/microcode: Remove unnecessary paravirt_enabled check Commit: a18a0f6850d4 ("x86, microcode: Don't initialize microcode code on paravirt") added a paravirt test in microcode_init(), primarily to avoid making mc_bp_resume()->load_ucode_ap()->check_loader_disabled_ap() calls because on 32-bit kernels this callchain ends up using __pa_nodebug() macro which is invalid for Xen PV guests. A subsequent commit: fbae4ba8c4a3 ("x86, microcode: Reload microcode on resume") eliminated this callchain thus making a18a0f6850d4 unnecessary. Signed-off-by: Boris Ostrovsky Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1455612202-14414-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cea8552e2b3a..ac360bfbbdb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -623,7 +623,7 @@ int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (paravirt_enabled() || dis_ucode_ldr) + if (dis_ucode_ldr) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) -- cgit From 9cc6f743c7724eb9abaf27904194c169db85dd31 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Tue, 16 Feb 2016 09:43:20 +0100 Subject: x86/microcode: Use kmemdup() rather than duplicating its implementation The patch was generated using fixed coccinelle semantic patch scripts/coccinelle/api/memdup.cocci. 
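[ Aside: the transformation the semantic patch performs is mechanical. A before/after sketch with hypothetical names (not taken from the microcode drivers) looks like this: ]

#include <linux/slab.h>
#include <linux/string.h>

/* before: open-coded allocate-and-copy */
static void *dup_blob_old(const void *blob, size_t len)
{
        void *p = kzalloc(len, GFP_KERNEL);

        if (!p)
                return NULL;
        memcpy(p, blob, len);
        return p;
}

/* after: one call with the same result */
static void *dup_blob_new(const void *blob, size_t len)
{
        return kmemdup(blob, len, GFP_KERNEL);
}

Dropping the zeroing done by kzalloc() is safe because the memcpy() overwrites the whole allocation, which is the case in both hunks below.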
Signed-off-by: Andrzej Hajda Signed-off-by: Borislav Petkov Cc: Bartlomiej Zolnierkiewicz Cc: Linus Torvalds Cc: Marek Szyprowski Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455612202-14414-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 4 +--- arch/x86/kernel/cpu/microcode/intel.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f66cbfe74ce4..e397fc160b05 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -788,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) return -EINVAL; } - patch->data = kzalloc(patch_size, GFP_KERNEL); + patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL); if (!patch->data) { pr_err("Patch data allocation failure.\n"); kfree(patch); return -EINVAL; } - /* All looks ok, copy patch... */ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); INIT_LIST_HEAD(&patch->plist); patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cb397947f688..cbb3cf09b065 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -210,13 +210,11 @@ save_microcode(struct mc_saved_data *mcs, mc_hdr = &mc->hdr; size = get_totalsize(mc_hdr); - saved_ptr[i] = kmalloc(size, GFP_KERNEL); + saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); if (!saved_ptr[i]) { ret = -ENOMEM; goto err; } - - memcpy(saved_ptr[i], mc, size); } /* -- cgit From f1b92bb6b5a4e17b508f128b084fa00e0eda590c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 09:43:21 +0100 Subject: x86/ftrace, x86/asm: Kill ftrace_caller_end label One of ftrace_caller_end and ftrace_return is redundant so unify them. Rename ftrace_return to ftrace_epilogue to mean that everything after that label represents, like an afterword, work which happens *after* the ftrace call, e.g., the function graph tracer for one. Steve wants this to rather mean "[a]n event which reflects meaningfully on a recently ended conflict or struggle." I can imagine that ftrace can be a struggle sometimes. Anyway, beef up the comment about the code contents and layout before ftrace_epilogue label. Signed-off-by: Borislav Petkov Reviewed-by: Steven Rostedt Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455612202-14414-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 11 +++++------ arch/x86/kernel/mcount_64.S | 14 ++++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6d6626..04f9641e0cb6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ -extern void ftrace_caller_end(void); extern void ftrace_regs_caller_end(void); -extern void ftrace_return(void); +extern void ftrace_epilogue(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_offset = (unsigned long)ftrace_regs_caller_op_ptr; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_caller_end; + end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; } @@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* * Allocate enough size to store the ftrace_caller code, - * the jmp to ftrace_return, as well as the address of + * the jmp to ftrace_epilogue, as well as the address of * the ftrace_ops this trampoline is used for. */ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); @@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = (unsigned long)trampoline + size; - /* The trampoline ends with a jmp to ftrace_return */ - jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + /* The trampoline ends with a jmp to ftrace_epilogue */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue); memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); /* diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 87e1762e2bca..ed48a9f465f8 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -168,12 +168,14 @@ GLOBAL(ftrace_call) restore_mcount_regs /* - * The copied trampoline must call ftrace_return as it + * The copied trampoline must call ftrace_epilogue as it * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. */ -GLOBAL(ftrace_caller_end) - -GLOBAL(ftrace_return) +GLOBAL(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call) popfq /* - * As this jmp to ftrace_return can be a short jump + * As this jmp to ftrace_epilogue can be a short jump * it must not be copied into the trampoline. * The trampoline will add the code to jump * to the return. */ GLOBAL(ftrace_regs_caller_end) - jmp ftrace_return + jmp ftrace_epilogue END(ftrace_regs_caller) -- cgit From 053080a9d1c8cf1950115ad92ce94242ebc5f25c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 09:43:22 +0100 Subject: x86/msr: Document msr-index.h rule for addition In order to keep this file's size sensible and not cause too much unnecessary churn, make the rule explicit - similar to pci_ids.h - that only MSRs which are used in multiple compilation units, should get added to it. 
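[ Aside: in practice the rule means an MSR with a single consumer is defined next to that consumer. A hedged sketch, with an invented MSR name, address, and helper: ]

#include <linux/types.h>
#include <asm/msr.h>

/* hypothetical MSR used only in this file, so it stays local */
#define MSR_EXAMPLE_LOCAL_CTL   0xc0011234

static void example_enable_feature(void)
{
        u64 val;

        rdmsrl(MSR_EXAMPLE_LOCAL_CTL, val);
        wrmsrl(MSR_EXAMPLE_LOCAL_CTL, val | 0x1);
}

Only if a second compilation unit later needed the same MSR would the definition move into msr-index.h, mirroring the pci_ids.h convention the log cites.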
Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: alex.williamson@redhat.com Cc: gleb@kernel.org Cc: joro@8bytes.org Cc: kvm@vger.kernel.org Cc: sherry.hurwitz@amd.com Cc: wei@redhat.com Link: http://lkml.kernel.org/r/1455612202-14414-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..984ab75bf621 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1,7 +1,12 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H -/* CPU model specific register (MSR) numbers */ +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ -- cgit From 4e79e182b419172e35936a47f098509092d69817 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:27 -0800 Subject: x86/entry/compat: Keep TS_COMPAT set during signal delivery Signal delivery needs to know the sign of an interrupted syscall's return value in order to detect -ERESTART variants. Normally this works independently of bitness because syscalls internally return long. Under ptrace, however, this can break, and syscall_get_error is supposed to sign-extend regs->ax if needed. We were clearing TS_COMPAT too early, though, and this prevented sign extension, which subtly broke syscall restart under ptrace. Reported-by: Robert O'Callahan Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: stable@vger.kernel.org # 4.3.x- Fixes: c5c46f59e4e7 ("x86/entry: Add new, comprehensible entry and exit handlers written in C") Link: http://lkml.kernel.org/r/cbce3cf545522f64eb37f5478cb59746230db3b5.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index c6ab2ebb5f4f..1a000f59f04a 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -269,6 +269,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { + struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -276,12 +277,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) lockdep_sys_exit(); - cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(ti->flags); if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. 
The fixup is exercised by the ptrace_syscall_32 + * selftest. + */ + ti->status &= ~TS_COMPAT; +#endif + user_enter(); } @@ -333,14 +344,6 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. - */ - ti->status &= ~TS_COMPAT; -#endif - local_irq_disable(); prepare_exit_to_usermode(regs); } -- cgit From af5d3aabc04a4b7732b1d3404feebadfe5ae9362 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:07 +0100 Subject: perf/x86: Move perf_event_intel_bts.c ........ => x86/events/intel/bts.c Start moving the Intel bits. Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 1 + arch/x86/events/intel/bts.c | 544 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_intel_bts.c | 544 ----------------------------- 4 files changed, 546 insertions(+), 545 deletions(-) create mode 100644 arch/x86/events/intel/bts.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_bts.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 7d1ecff583b0..bcd1e214a76d 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif +obj-$(CONFIG_CPU_SUP_INTEL) += intel/bts.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c new file mode 100644 index 000000000000..2bd4833367a7 --- /dev/null +++ b/arch/x86/events/intel/bts.c @@ -0,0 +1,544 @@ +/* + * BTS PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +struct bts_ctx { + struct perf_output_handle handle; + struct debug_store ds_back; + int started; +}; + +static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); + +#define BTS_RECORD_SIZE 24 +#define BTS_SAFETY_MARGIN 4080 + +struct bts_phys { + struct page *page; + unsigned long size; + unsigned long offset; + unsigned long displacement; +}; + +struct bts_buffer { + size_t real_size; /* multiple of BTS_RECORD_SIZE */ + unsigned int nr_pages; + unsigned int nr_bufs; + unsigned int cur_buf; + bool snapshot; + local_t data_size; + local_t lost; + local_t head; + unsigned long end; + void **data_pages; + struct bts_phys buf[0]; +}; + +struct pmu bts_pmu; + +static size_t buf_size(struct page *page) +{ + return 1 << (PAGE_SHIFT + page_private(page)); +} + +static void * +bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) +{ + struct bts_buffer *buf; + struct page *page; + int node = (cpu == -1) ? cpu : cpu_to_node(cpu); + unsigned long offset; + size_t size = nr_pages << PAGE_SHIFT; + int pg, nbuf, pad; + + /* count all the high order buffers */ + for (pg = 0, nbuf = 0; pg < nr_pages;) { + page = virt_to_page(pages[pg]); + if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) + return NULL; + pg += 1 << page_private(page); + nbuf++; + } + + /* + * to avoid interrupts in overwrite mode, only allow one physical + */ + if (overwrite && nbuf > 1) + return NULL; + + buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->nr_pages = nr_pages; + buf->nr_bufs = nbuf; + buf->snapshot = overwrite; + buf->data_pages = pages; + buf->real_size = size - size % BTS_RECORD_SIZE; + + for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { + unsigned int __nr_pages; + + page = virt_to_page(pages[pg]); + __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; + buf->buf[nbuf].page = page; + buf->buf[nbuf].offset = offset; + buf->buf[nbuf].displacement = (pad ? 
BTS_RECORD_SIZE - pad : 0); + buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; + pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; + buf->buf[nbuf].size -= pad; + + pg += __nr_pages; + offset += __nr_pages << PAGE_SHIFT; + } + + return buf; +} + +static void bts_buffer_free_aux(void *data) +{ + kfree(data); +} + +static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) +{ + return buf->buf[idx].offset + buf->buf[idx].displacement; +} + +static void +bts_config_buffer(struct bts_buffer *buf) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_phys *phys = &buf->buf[buf->cur_buf]; + unsigned long index, thresh = 0, end = phys->size; + struct page *page = phys->page; + + index = local_read(&buf->head); + + if (!buf->snapshot) { + if (buf->end < phys->offset + buf_size(page)) + end = buf->end - phys->offset - phys->displacement; + + index -= phys->offset + phys->displacement; + + if (end - index > BTS_SAFETY_MARGIN) + thresh = end - BTS_SAFETY_MARGIN; + else if (end - index > BTS_RECORD_SIZE) + thresh = end - BTS_RECORD_SIZE; + else + thresh = end; + } + + ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; + ds->bts_index = ds->bts_buffer_base + index; + ds->bts_absolute_maximum = ds->bts_buffer_base + end; + ds->bts_interrupt_threshold = !buf->snapshot + ? ds->bts_buffer_base + thresh + : ds->bts_absolute_maximum + BTS_RECORD_SIZE; +} + +static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) +{ + unsigned long index = head - phys->offset; + + memset(page_address(phys->page) + index, 0, phys->size - index); +} + +static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= bts->handle.size || + bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) + return true; + + return false; +} + +static void bts_update(struct bts_ctx *bts) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_buffer *buf = perf_get_aux(&bts->handle); + unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; + + if (!buf) + return; + + head = index + bts_buffer_offset(buf, buf->cur_buf); + old = local_xchg(&buf->head, head); + + if (!buf->snapshot) { + if (old == head) + return; + + if (ds->bts_index >= ds->bts_absolute_maximum) + local_inc(&buf->lost); + + /* + * old and head are always in the same physical buffer, so we + * can subtract them to get the data size. 
+ */ + local_add(head - old, &buf->data_size); + } else { + local_set(&buf->data_size, head); + } +} + +static void __bts_event_start(struct perf_event *event) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + u64 config = 0; + + if (!buf || bts_buffer_is_full(buf, bts)) + return; + + event->hw.itrace_started = 1; + event->hw.state = 0; + + if (!buf->snapshot) + config |= ARCH_PERFMON_EVENTSEL_INT; + if (!event->attr.exclude_kernel) + config |= ARCH_PERFMON_EVENTSEL_OS; + if (!event->attr.exclude_user) + config |= ARCH_PERFMON_EVENTSEL_USR; + + bts_config_buffer(buf); + + /* + * local barrier to make sure that ds configuration made it + * before we enable BTS + */ + wmb(); + + intel_pmu_enable_bts(config); +} + +static void bts_event_start(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + __bts_event_start(event); + + /* PMI handler: this counter is running and likely generating PMIs */ + ACCESS_ONCE(bts->started) = 1; +} + +static void __bts_event_stop(struct perf_event *event) +{ + /* + * No extra synchronization is mandated by the documentation to have + * BTS data stores globally visible. + */ + intel_pmu_disable_bts(); + + if (event->hw.state & PERF_HES_STOPPED) + return; + + ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; +} + +static void bts_event_stop(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* PMI handler: don't restart this counter */ + ACCESS_ONCE(bts->started) = 0; + + __bts_event_stop(event); + + if (flags & PERF_EF_UPDATE) + bts_update(bts); +} + +void intel_bts_enable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event && bts->started) + __bts_event_start(bts->handle.event); +} + +void intel_bts_disable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event) + __bts_event_stop(bts->handle.event); +} + +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) +{ + unsigned long head, space, next_space, pad, gap, skip, wakeup; + unsigned int next_buf; + struct bts_phys *phys, *next_phys; + int ret; + + if (buf->snapshot) + return 0; + + head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); + if (WARN_ON_ONCE(head != local_read(&buf->head))) + return -EINVAL; + + phys = &buf->buf[buf->cur_buf]; + space = phys->offset + phys->displacement + phys->size - head; + pad = space; + if (space > handle->size) { + space = handle->size; + space -= space % BTS_RECORD_SIZE; + } + if (space <= BTS_SAFETY_MARGIN) { + /* See if next phys buffer has more space */ + next_buf = buf->cur_buf + 1; + if (next_buf >= buf->nr_bufs) + next_buf = 0; + next_phys = &buf->buf[next_buf]; + gap = buf_size(phys->page) - phys->displacement - phys->size + + next_phys->displacement; + skip = pad + gap; + if (handle->size >= skip) { + next_space = next_phys->size; + if (next_space + skip > handle->size) { + next_space = handle->size - skip; + next_space -= next_space % BTS_RECORD_SIZE; + } + if (next_space > space || !space) { + if (pad) + bts_buffer_pad_out(phys, head); + ret = perf_aux_output_skip(handle, skip); + if (ret) + return ret; + /* Advance to next phys buffer */ + phys = next_phys; + space = next_space; + head = phys->offset + phys->displacement; + /* + * After this, cur_buf and head won't match ds + * anymore, so we must not be racing with + * bts_update(). 
+ */ + buf->cur_buf = next_buf; + local_set(&buf->head, head); + } + } + } + + /* Don't go far beyond wakeup watermark */ + wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - + handle->head; + if (space > wakeup) { + space = wakeup; + space -= space % BTS_RECORD_SIZE; + } + + buf->end = head + space; + + /* + * If we have no space, the lost notification would have been sent when + * we hit absolute_maximum - see bts_update() + */ + if (!space) + return -ENOSPC; + + return 0; +} + +int intel_bts_interrupt(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct perf_event *event = bts->handle.event; + struct bts_buffer *buf; + s64 old_head; + int err; + + if (!event || !bts->started) + return 0; + + buf = perf_get_aux(&bts->handle); + /* + * Skip snapshot counters: they don't use the interrupt, but + * there's no other way of telling, because the pointer will + * keep moving + */ + if (!buf || buf->snapshot) + return 0; + + old_head = local_read(&buf->head); + bts_update(bts); + + /* no new data */ + if (old_head == local_read(&buf->head)) + return 0; + + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return 1; + + err = bts_buffer_reset(buf, &bts->handle); + if (err) + perf_aux_output_end(&bts->handle, 0, false); + + return 1; +} + +static void bts_event_del(struct perf_event *event, int mode) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + + bts_event_stop(event, PERF_EF_UPDATE); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; +} + +static int bts_event_add(struct perf_event *event, int mode) +{ + struct bts_buffer *buf; + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + event->hw.state = PERF_HES_STOPPED; + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + return -EBUSY; + + if (bts->handle.event) + return -EBUSY; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return -EINVAL; + + ret = bts_buffer_reset(buf, &bts->handle); + if (ret) { + perf_aux_output_end(&bts->handle, 0, false); + return ret; + } + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + if (mode & PERF_EF_START) { + bts_event_start(event, 0); + if (hwc->state & PERF_HES_STOPPED) { + bts_event_del(event, 0); + return -EBUSY; + } + } + + return 0; +} + +static void bts_event_destroy(struct perf_event *event) +{ + x86_release_hardware(); + x86_del_exclusive(x86_lbr_exclusive_bts); +} + +static int bts_event_init(struct perf_event *event) +{ + int ret; + + if (event->attr.type != bts_pmu.type) + return -ENOENT; + + if (x86_add_exclusive(x86_lbr_exclusive_bts)) + return -EBUSY; + + /* + * BTS leaks kernel addresses even when CPL0 
tracing is + * disabled, so disallow intel_bts driver for unprivileged + * users on paranoid systems since it provides trace data + * to the user in a zero-copy fashion. + * + * Note that the default paranoia setting permits unprivileged + * users to profile the kernel. + */ + if (event->attr.exclude_kernel && perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + ret = x86_reserve_hardware(); + if (ret) { + x86_del_exclusive(x86_lbr_exclusive_bts); + return ret; + } + + event->destroy = bts_event_destroy; + + return 0; +} + +static void bts_event_read(struct perf_event *event) +{ +} + +static __init int bts_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + return -ENODEV; + + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.task_ctx_nr = perf_sw_context; + bts_pmu.event_init = bts_event_init; + bts_pmu.add = bts_event_add; + bts_pmu.del = bts_event_del; + bts_pmu.start = bts_event_start; + bts_pmu.stop = bts_event_stop; + bts_pmu.read = bts_event_read; + bts_pmu.setup_aux = bts_buffer_setup_aux; + bts_pmu.free_aux = bts_buffer_free_aux; + + return perf_pmu_register(&bts_pmu, "intel_bts", -1); +} +arch_initcall(bts_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7edbeb945be5..e28f93158016 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,7 +34,7 @@ ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c deleted file mode 100644 index 2cad71d1b14c..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ /dev/null @@ -1,544 +0,0 @@ -/* - * BTS PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- */ - -#undef DEBUG - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "perf_event.h" - -struct bts_ctx { - struct perf_output_handle handle; - struct debug_store ds_back; - int started; -}; - -static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); - -#define BTS_RECORD_SIZE 24 -#define BTS_SAFETY_MARGIN 4080 - -struct bts_phys { - struct page *page; - unsigned long size; - unsigned long offset; - unsigned long displacement; -}; - -struct bts_buffer { - size_t real_size; /* multiple of BTS_RECORD_SIZE */ - unsigned int nr_pages; - unsigned int nr_bufs; - unsigned int cur_buf; - bool snapshot; - local_t data_size; - local_t lost; - local_t head; - unsigned long end; - void **data_pages; - struct bts_phys buf[0]; -}; - -struct pmu bts_pmu; - -static size_t buf_size(struct page *page) -{ - return 1 << (PAGE_SHIFT + page_private(page)); -} - -static void * -bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) -{ - struct bts_buffer *buf; - struct page *page; - int node = (cpu == -1) ? cpu : cpu_to_node(cpu); - unsigned long offset; - size_t size = nr_pages << PAGE_SHIFT; - int pg, nbuf, pad; - - /* count all the high order buffers */ - for (pg = 0, nbuf = 0; pg < nr_pages;) { - page = virt_to_page(pages[pg]); - if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) - return NULL; - pg += 1 << page_private(page); - nbuf++; - } - - /* - * to avoid interrupts in overwrite mode, only allow one physical - */ - if (overwrite && nbuf > 1) - return NULL; - - buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); - if (!buf) - return NULL; - - buf->nr_pages = nr_pages; - buf->nr_bufs = nbuf; - buf->snapshot = overwrite; - buf->data_pages = pages; - buf->real_size = size - size % BTS_RECORD_SIZE; - - for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { - unsigned int __nr_pages; - - page = virt_to_page(pages[pg]); - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; - buf->buf[nbuf].page = page; - buf->buf[nbuf].offset = offset; - buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; - pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; - buf->buf[nbuf].size -= pad; - - pg += __nr_pages; - offset += __nr_pages << PAGE_SHIFT; - } - - return buf; -} - -static void bts_buffer_free_aux(void *data) -{ - kfree(data); -} - -static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) -{ - return buf->buf[idx].offset + buf->buf[idx].displacement; -} - -static void -bts_config_buffer(struct bts_buffer *buf) -{ - int cpu = raw_smp_processor_id(); - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_phys *phys = &buf->buf[buf->cur_buf]; - unsigned long index, thresh = 0, end = phys->size; - struct page *page = phys->page; - - index = local_read(&buf->head); - - if (!buf->snapshot) { - if (buf->end < phys->offset + buf_size(page)) - end = buf->end - phys->offset - phys->displacement; - - index -= phys->offset + phys->displacement; - - if (end - index > BTS_SAFETY_MARGIN) - thresh = end - BTS_SAFETY_MARGIN; - else if (end - index > BTS_RECORD_SIZE) - thresh = end - BTS_RECORD_SIZE; - else - thresh = end; - } - - ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; - ds->bts_index = ds->bts_buffer_base + index; - ds->bts_absolute_maximum = ds->bts_buffer_base + end; - ds->bts_interrupt_threshold = !buf->snapshot - ? 
ds->bts_buffer_base + thresh - : ds->bts_absolute_maximum + BTS_RECORD_SIZE; -} - -static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) -{ - unsigned long index = head - phys->offset; - - memset(page_address(phys->page) + index, 0, phys->size - index); -} - -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - -static void bts_update(struct bts_ctx *bts) -{ - int cpu = raw_smp_processor_id(); - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_buffer *buf = perf_get_aux(&bts->handle); - unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; - - if (!buf) - return; - - head = index + bts_buffer_offset(buf, buf->cur_buf); - old = local_xchg(&buf->head, head); - - if (!buf->snapshot) { - if (old == head) - return; - - if (ds->bts_index >= ds->bts_absolute_maximum) - local_inc(&buf->lost); - - /* - * old and head are always in the same physical buffer, so we - * can subtract them to get the data size. - */ - local_add(head - old, &buf->data_size); - } else { - local_set(&buf->data_size, head); - } -} - -static void __bts_event_start(struct perf_event *event) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - u64 config = 0; - - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - - if (!buf->snapshot) - config |= ARCH_PERFMON_EVENTSEL_INT; - if (!event->attr.exclude_kernel) - config |= ARCH_PERFMON_EVENTSEL_OS; - if (!event->attr.exclude_user) - config |= ARCH_PERFMON_EVENTSEL_USR; - - bts_config_buffer(buf); - - /* - * local barrier to make sure that ds configuration made it - * before we enable BTS - */ - wmb(); - - intel_pmu_enable_bts(config); -} - -static void bts_event_start(struct perf_event *event, int flags) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - __bts_event_start(event); - - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; -} - -static void __bts_event_stop(struct perf_event *event) -{ - /* - * No extra synchronization is mandated by the documentation to have - * BTS data stores globally visible. 
- */ - intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; -} - -static void bts_event_stop(struct perf_event *event, int flags) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; - - __bts_event_stop(event); - - if (flags & PERF_EF_UPDATE) - bts_update(bts); -} - -void intel_bts_enable_local(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - if (bts->handle.event && bts->started) - __bts_event_start(bts->handle.event); -} - -void intel_bts_disable_local(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - if (bts->handle.event) - __bts_event_stop(bts->handle.event); -} - -static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) -{ - unsigned long head, space, next_space, pad, gap, skip, wakeup; - unsigned int next_buf; - struct bts_phys *phys, *next_phys; - int ret; - - if (buf->snapshot) - return 0; - - head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; - - phys = &buf->buf[buf->cur_buf]; - space = phys->offset + phys->displacement + phys->size - head; - pad = space; - if (space > handle->size) { - space = handle->size; - space -= space % BTS_RECORD_SIZE; - } - if (space <= BTS_SAFETY_MARGIN) { - /* See if next phys buffer has more space */ - next_buf = buf->cur_buf + 1; - if (next_buf >= buf->nr_bufs) - next_buf = 0; - next_phys = &buf->buf[next_buf]; - gap = buf_size(phys->page) - phys->displacement - phys->size + - next_phys->displacement; - skip = pad + gap; - if (handle->size >= skip) { - next_space = next_phys->size; - if (next_space + skip > handle->size) { - next_space = handle->size - skip; - next_space -= next_space % BTS_RECORD_SIZE; - } - if (next_space > space || !space) { - if (pad) - bts_buffer_pad_out(phys, head); - ret = perf_aux_output_skip(handle, skip); - if (ret) - return ret; - /* Advance to next phys buffer */ - phys = next_phys; - space = next_space; - head = phys->offset + phys->displacement; - /* - * After this, cur_buf and head won't match ds - * anymore, so we must not be racing with - * bts_update(). 
- */ - buf->cur_buf = next_buf; - local_set(&buf->head, head); - } - } - } - - /* Don't go far beyond wakeup watermark */ - wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - - handle->head; - if (space > wakeup) { - space = wakeup; - space -= space % BTS_RECORD_SIZE; - } - - buf->end = head + space; - - /* - * If we have no space, the lost notification would have been sent when - * we hit absolute_maximum - see bts_update() - */ - if (!space) - return -ENOSPC; - - return 0; -} - -int intel_bts_interrupt(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct perf_event *event = bts->handle.event; - struct bts_buffer *buf; - s64 old_head; - int err; - - if (!event || !bts->started) - return 0; - - buf = perf_get_aux(&bts->handle); - /* - * Skip snapshot counters: they don't use the interrupt, but - * there's no other way of telling, because the pointer will - * keep moving - */ - if (!buf || buf->snapshot) - return 0; - - old_head = local_read(&buf->head); - bts_update(bts); - - /* no new data */ - if (old_head == local_read(&buf->head)) - return 0; - - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; - - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); - - return 1; -} - -static void bts_event_del(struct perf_event *event, int mode) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; -} - -static int bts_event_add(struct perf_event *event, int mode) -{ - struct bts_buffer *buf; - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; - - event->hw.state = PERF_HES_STOPPED; - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - return -EBUSY; - - if (bts->handle.event) - return -EBUSY; - - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - - if (mode & PERF_EF_START) { - bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } - } - - return 0; -} - -static void bts_event_destroy(struct perf_event *event) -{ - x86_release_hardware(); - x86_del_exclusive(x86_lbr_exclusive_bts); -} - -static int bts_event_init(struct perf_event *event) -{ - int ret; - - if (event->attr.type != bts_pmu.type) - return -ENOENT; - - if (x86_add_exclusive(x86_lbr_exclusive_bts)) - return -EBUSY; - - /* - * BTS leaks kernel addresses even when CPL0 
tracing is - * disabled, so disallow intel_bts driver for unprivileged - * users on paranoid systems since it provides trace data - * to the user in a zero-copy fashion. - * - * Note that the default paranoia setting permits unprivileged - * users to profile the kernel. - */ - if (event->attr.exclude_kernel && perf_paranoid_kernel() && - !capable(CAP_SYS_ADMIN)) - return -EACCES; - - ret = x86_reserve_hardware(); - if (ret) { - x86_del_exclusive(x86_lbr_exclusive_bts); - return ret; - } - - event->destroy = bts_event_destroy; - - return 0; -} - -static void bts_event_read(struct perf_event *event) -{ -} - -static __init int bts_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) - return -ENODEV; - - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; - bts_pmu.task_ctx_nr = perf_sw_context; - bts_pmu.event_init = bts_event_init; - bts_pmu.add = bts_event_add; - bts_pmu.del = bts_event_del; - bts_pmu.start = bts_event_start; - bts_pmu.stop = bts_event_stop; - bts_pmu.read = bts_event_read; - bts_pmu.setup_aux = bts_buffer_setup_aux; - bts_pmu.free_aux = bts_buffer_free_aux; - - return perf_pmu_register(&bts_pmu, "intel_bts", -1); -} -arch_initcall(bts_init); -- cgit From e1069839dd6893d2135b2fc4d96e5d03d73c2c3d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:08 +0100 Subject: perf/x86: Move perf_event_intel.c ............ => x86/events/intel/core.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/core.c | 3773 ++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 3773 -------------------------------- 4 files changed, 3775 insertions(+), 3775 deletions(-) create mode 100644 arch/x86/events/intel/core.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index bcd1e214a76d..834e9ae507f9 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c new file mode 100644 index 000000000000..1edf3013e528 --- /dev/null +++ b/arch/x86/events/intel/core.c @@ -0,0 +1,3773 @@ +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +/* + * Intel PerfMon, used on Core and later. 
+ */ +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ +}; + +static struct event_constraint intel_core_event_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_snb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + 
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_ivb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_skl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), 
/* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_knl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, + MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, + MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + /* + * Note the low 8 bits eventsel code is not a continuous field, containing + * some #GPing bits. These are masked out. + */ + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); + +struct attribute *nhm_events_attrs[] = { + EVENT_PTR(mem_ld_nhm), + NULL, +}; + +struct attribute *snb_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + +static struct event_constraint intel_hsw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), + /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), + /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_bdw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ + EVENT_CONSTRAINT_END +}; + +static u64 
intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. + * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD BIT_ULL(0) +#define SKL_DEMAND_RFO BIT_ULL(1) +#define SKL_ANY_RESPONSE BIT_ULL(16) +#define SKL_SUPPLIER_NONE BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) +#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ + SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT BIT_ULL(30) +#define SKL_SNOOP_NONE BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) +#define SKL_SNOOP_MISS BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define SKL_SNOOP_HITM BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM BIT_ULL(37) +#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE SKL_DEMAND_RFO +#define SKL_LLC_ACCESS SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ 
C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) +#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define 
SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) 
] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + +}; + +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts because they are not + * reliably counted. + */ + +#define HSW_DEMAND_DATA_RD BIT_ULL(0) +#define HSW_DEMAND_RFO BIT_ULL(1) +#define HSW_ANY_RESPONSE BIT_ULL(16) +#define HSW_SUPPLIER_NONE BIT_ULL(17) +#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) +#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) +#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) +#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) +#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_SNOOP_NONE BIT_ULL(31) +#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) +#define HSW_SNOOP_MISS BIT_ULL(33) +#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define HSW_SNOOP_HITM BIT_ULL(36) +#define HSW_SNOOP_NON_DRAM BIT_ULL(37) +#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ + HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ + HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ + HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) +#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) +#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD +#define HSW_DEMAND_WRITE HSW_DEMAND_RFO +#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ + HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_LLC_ACCESS HSW_ANY_RESPONSE + +#define BDW_L3_MISS_LOCAL BIT(26) +#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) + + +static __initconst const u64 hsw_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ + }, + [ 
C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 hsw_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not 
WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE (NHM_REMOTE_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ 
C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, + }, + }, +}; + +static __initconst const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +static __initconst const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ 
C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ 
C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, 
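	/*
	 * All of the *_hw_cache_event_ids tables in this file are indexed
	 * as [cache][op][result].  A 0 entry means the combination is not
	 * counted on this CPU, while -1 marks a nonsensical combination
	 * (e.g. an L1I write).  Approximate sketch of the lookup done by
	 * the generic x86 code when a PERF_TYPE_HW_CACHE event is created:
	 *
	 *	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
	 *	if (val == 0)
	 *		return -ENOENT;
	 *	if (val == -1)
	 *		return -EINVAL;
	 *	hwc->config |= val;
	 */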
+ }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ +#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ +#define KNL_MCDRAM_LOCAL BIT_ULL(21) +#define KNL_MCDRAM_FAR BIT_ULL(22) +#define KNL_DDR_LOCAL BIT_ULL(23) +#define KNL_DDR_FAR BIT_ULL(24) +#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ + KNL_DDR_LOCAL | KNL_DDR_FAR) +#define KNL_L2_READ SLM_DMND_READ +#define KNL_L2_WRITE SLM_DMND_WRITE +#define KNL_L2_PREFETCH SLM_DMND_PREFETCH +#define KNL_L2_ACCESS SLM_LLC_ACCESS +#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ + KNL_DRAM_ANY | SNB_SNP_ANY | \ + SNB_NON_DRAM) + +static __initconst const u64 knl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, + }, + }, +}; + +/* + * Use from PMIs where the LBRs are already disabled. + */ +static void __intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); + else + intel_bts_disable_local(); + + intel_pmu_pebs_disable_all(); +} + +static void intel_pmu_disable_all(void) +{ + __intel_pmu_disable_all(); + intel_pmu_lbr_disable_all(); +} + +static void __intel_pmu_enable_all(int added, bool pmi) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + intel_pmu_pebs_enable_all(); + intel_pmu_lbr_enable_all(pmi); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } else + intel_bts_enable_local(); +} + +static void intel_pmu_enable_all(int added) +{ + __intel_pmu_enable_all(added, false); +} + +/* + * Workaround for: + * Intel Errata AAK100 (model 26) + * Intel Errata AAP53 (model 30) + * Intel Errata BD53 (model 44) + * + * The official story: + * These chips need to be 'reset' when adding counters by programming the + * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either + * in sequence on the same PMC or on different PMCs. + * + * In practise it appears some of these events do in fact count, and + * we need to programm all 4 events. 
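 *
 * For reference, the magic values decode as event select 0xB5/0xD2/0xB1
 * with umask 0x00 plus the USR (bit 16), OS (bit 17) and ENABLE (bit 22)
 * bits, e.g.:
 *
 *	0x4300B5 == 0xB5 | ARCH_PERFMON_EVENTSEL_USR |
 *		    ARCH_PERFMON_EVENTSEL_OS |
 *		    ARCH_PERFMON_EVENTSEL_ENABLE
 *
 * and the 0xf written to MSR_CORE_PERF_GLOBAL_CTRL below simply sets
 * bits 0-3, i.e. enables general-purpose counters PMC0-PMC3 for the
 * duration of the workaround.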
+ */ +static void intel_pmu_nhm_workaround(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + static const unsigned long nhm_magic[4] = { + 0x4300B5, + 0x4300D2, + 0x4300B1, + 0x4300B1 + }; + struct perf_event *event; + int i; + + /* + * The Errata requires below steps: + * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; + * 2) Configure 4 PERFEVTSELx with the magic events and clear + * the corresponding PMCx; + * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; + * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; + * 5) Clear 4 pairs of ERFEVTSELx and PMCx; + */ + + /* + * The real steps we choose are a little different from above. + * A) To reduce MSR operations, we don't run step 1) as they + * are already cleared before this function is called; + * B) Call x86_perf_event_update to save PMCx before configuring + * PERFEVTSELx with magic number; + * C) With step 5), we do clear only when the PERFEVTSELx is + * not used currently. + * D) Call x86_perf_event_set_period to restore PMCx; + */ + + /* We always operate 4 pairs of PERF Counters */ + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) + x86_perf_event_update(event); + } + + for (i = 0; i < 4; i++) { + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + } + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + + if (event) { + x86_perf_event_set_period(event); + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); + } else + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); + } +} + +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) + intel_pmu_nhm_workaround(); + intel_pmu_enable_all(added); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static inline bool event_is_checkpointed(struct perf_event *event) +{ + return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + +static void intel_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + cpuc->intel_cp_status &= ~(1ull << hwc->idx); + + /* + * must disable before any actual event + * because any event may be combined with LBR + */ + if (needs_branch_stack(event)) + intel_pmu_lbr_disable(event); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); +} + +static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 
0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + if (!__this_cpu_read(cpu_hw_events.enabled)) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + /* + * must enabled before any actual event + * because any event may be combined with LBR + */ + if (needs_branch_stack(event)) + intel_pmu_lbr_enable(event); + + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + + if (unlikely(event_is_checkpointed(event))) + cpuc->intel_cp_status |= (1ull << hwc->idx); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc); + return; + } + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +int intel_pmu_save_and_restart(struct perf_event *event) +{ + x86_perf_event_update(event); + /* + * For a checkpointed counter always reset back to 0. This + * avoids a situation where the counter overflows, aborts the + * transaction and is then set back to shortly before the + * overflow, and overflows and aborts again. + */ + if (unlikely(event_is_checkpointed(event))) { + /* No race with NMIs because the counter should not be armed */ + wrmsrl(event->hw.event_base, 0); + local64_set(&event->hw.prev_count, 0); + } + return x86_perf_event_set_period(event); +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + + if (ds) + ds->bts_index = ds->bts_buffer_base; + + /* Ack all overflows and disable fixed counters */ + if (x86_pmu.version >= 2) { + intel_pmu_ack_status(intel_pmu_get_status()); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + /* Reset LBRs and LBR freezing */ + if (x86_pmu.lbr_nr) { + update_debugctlmsr(get_debugctlmsr() & + ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); + } + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 status; + int handled; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. 
+ */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + __intel_pmu_disable_all(); + handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); + status = intel_pmu_get_status(); + if (!status) + goto done; + + loops = 0; +again: + intel_pmu_lbr_read(); + intel_pmu_ack_status(status); + if (++loops > 100) { + static bool warned = false; + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + + /* + * Ignore a range of extra bits in status that do not indicate + * overflow by themselves. + */ + status &= ~(GLOBAL_STATUS_COND_CHG | + GLOBAL_STATUS_ASIF | + GLOBAL_STATUS_LBRS_FROZEN); + if (!status) + goto done; + + /* + * PEBS overflow sets bit 62 in the global status register + */ + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; + x86_pmu.drain_pebs(regs); + } + + /* + * Intel PT + */ + if (__test_and_clear_bit(55, (unsigned long *)&status)) { + handled++; + intel_pt_interrupt(); + } + + /* + * Checkpointed counters can lead to 'spurious' PMIs because the + * rollback caused by the PMI will have cleared the overflow status + * bit. Therefore always force probe these counters. + */ + status |= cpuc->intel_cp_status; + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + +done: + __intel_pmu_enable_all(0, true); + /* + * Only unmask the NMI after the overflow counters + * have been reset. This avoids spurious NMIs on + * Haswell CPUs. 
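 *
 * In outline, the two ACK modes bracket this handler as follows (a
 * condensed sketch of the apic_write() calls in this function, not
 * additional code):
 *
 *	if (!x86_pmu.late_ack)
 *		apic_write(APIC_LVTPC, APIC_DM_NMI);	on entry
 *	... disable PMU, drain BTS/PEBS, handle overflowed counters ...
 *	__intel_pmu_enable_all(0, true);
 *	if (x86_pmu.late_ack)
 *		apic_write(APIC_LVTPC, APIC_DM_NMI);	on exit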
+ */ + if (x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + return handled; +} + +static struct event_constraint * +intel_bts_constraints(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; + + if (event->attr.freq) + return NULL; + + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) + return &bts_constraint; + + return NULL; +} + +static int intel_alt_er(int idx, u64 config) +{ + int alt_idx = idx; + + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) + return idx; + + if (idx == EXTRA_REG_RSP_0) + alt_idx = EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + alt_idx = EXTRA_REG_RSP_0; + + if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + return idx; + + return alt_idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } +} + +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ +static struct event_constraint * +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct hw_perf_event_extra *reg) +{ + struct event_constraint *c = &emptyconstraint; + struct er_account *era; + unsigned long flags; + int idx = reg->idx; + + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) + return NULL; /* call x86_get_event_constraint() */ + +again: + era = &cpuc->shared_regs->regs[idx]; + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. 
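 *
 * Rough usage sketch: if two events (e.g. on sibling HT threads) want
 * MSR_OFFCORE_RSP_0 with different configs, the first one locks in
 * era->config and takes a reference; the second fails the
 * "!atomic_read(&era->ref) || era->config == reg->config" check,
 * retries against EXTRA_REG_RSP_1 via intel_alt_er(), and only when
 * no alternate register is free does it end up with &emptyconstraint,
 * i.e. the event cannot be scheduled for now.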
+ */ + reg->alloc = 1; + } + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* + * need to call x86_get_event_constraint() + * to check if associated event has constraints + */ + c = NULL; + } else { + idx = intel_alt_er(idx, reg->config); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } + } + raw_spin_unlock_irqrestore(&era->lock, flags); + + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. + */ + if (!reg->alloc || cpuc->is_fake) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } + return c; +} + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &unconstrained; +} + +static struct event_constraint * +__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_bts_constraints(event); + if (c) + return c; + + c = intel_shared_regs_constraints(cpuc, event); + if (c) + return c; + + c = intel_pebs_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, idx, event); +} + +static void +intel_start_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + + /* + * no exclusion needed + */ + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + xl = &excl_cntrs->states[tid]; + + xl->sched_started = true; + /* + * lock shared state until we are done scheduling + * in stop_event_scheduling() + * makes scheduling appear as a transaction + */ + raw_spin_lock(&excl_cntrs->lock); +} + +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = cpuc->event_constraint[idx]; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + 
+ if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + xl = &excl_cntrs->states[tid]; + + lockdep_assert_held(&excl_cntrs->lock); + + if (c->flags & PERF_X86_EVENT_EXCL) + xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xl->state[cntr] = INTEL_EXCL_SHARED; +} + +static void +intel_stop_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + /* + * no exclusion needed + */ + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + xl = &excl_cntrs->states[tid]; + + xl->sched_started = false; + /* + * release shared state lock (acquired in intel_start_scheduling()) + */ + raw_spin_unlock(&excl_cntrs->lock); +} + +static struct event_constraint * +intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + int idx, struct event_constraint *c) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xlo; + int tid = cpuc->excl_thread_id; + int is_excl, i; + + /* + * validating a group does not require + * enforcing cross-thread exclusion + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return c; + + /* + * no exclusion needed + */ + if (WARN_ON_ONCE(!excl_cntrs)) + return c; + + /* + * because we modify the constraint, we need + * to make a copy. Static constraints come + * from static const tables. + * + * only needed when constraint has not yet + * been cloned (marked dynamic) + */ + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + struct event_constraint *cx; + + /* + * grab pre-allocated constraint entry + */ + cx = &cpuc->constraint_list[idx]; + + /* + * initialize dynamic constraint + * with static constraint + */ + *cx = *c; + + /* + * mark constraint as dynamic, so we + * can free it later on + */ + cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; + } + + /* + * From here on, the constraint is dynamic. 
+ * Either it was just allocated above, or it + * was allocated during a earlier invocation + * of this function + */ + + /* + * state of sibling HT + */ + xlo = &excl_cntrs->states[tid ^ 1]; + + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } + + /* + * Modify static constraint with current dynamic + * state of thread + * + * EXCLUSIVE: sibling counter measuring exclusive event + * SHARED : sibling counter measuring non-exclusive event + * UNUSED : sibling counter unused + */ + for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { + /* + * exclusive event in sibling counter + * our corresponding counter cannot be used + * regardless of our event + */ + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + __clear_bit(i, c->idxmsk); + /* + * if measuring an exclusive event, sibling + * measuring non-exclusive, then counter cannot + * be used + */ + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + __clear_bit(i, c->idxmsk); + } + + /* + * recompute actual bit weight for scheduling algorithm + */ + c->weight = hweight64(c->idxmsk64); + + /* + * if we return an empty mask, then switch + * back to static empty constraint to avoid + * the cost of freeing later on + */ + if (c->weight == 0) + c = &emptyconstraint; + + return c; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c1 = NULL; + struct event_constraint *c2; + + if (idx >= 0) /* fake does < 0 */ + c1 = cpuc->event_constraint[idx]; + + /* + * first time only + * - static constraint: no change across incremental scheduling calls + * - dynamic constraint: handled by intel_get_excl_constraints() + */ + c2 = __intel_get_event_constraints(cpuc, idx, event); + if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); + c1->weight = c2->weight; + c2 = c1; + } + + if (cpuc->excl_cntrs) + return intel_get_excl_constraints(cpuc, event, idx, c2); + + return c2; +} + +static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + int tid = cpuc->excl_thread_id; + struct intel_excl_states *xl; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake) + return; + + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; + if (!--cpuc->n_excl) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); + } + + /* + * If event was actually assigned, then mark the counter state as + * unused now. + */ + if (hwc->idx >= 0) { + xl = &excl_cntrs->states[tid]; + + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional. 
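+ *
+ * xl->sched_started is set by intel_start_scheduling() just before
+ * it takes excl_cntrs->lock and cleared by intel_stop_scheduling()
+ * just before it drops the lock, so it tells us whether the
+ * scheduling path already holds the lock for us.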
+ */ + if (!xl->sched_started) + raw_spin_lock(&excl_cntrs->lock); + + xl->state[hwc->idx] = INTEL_EXCL_UNUSED; + + if (!xl->sched_started) + raw_spin_unlock(&excl_cntrs->lock); + } +} + +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} + +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); + + /* + * is PMU has exclusive counter restrictions, then + * all events are subject to and must call the + * put_excl_constraints() routine + */ + if (cpuc->excl_cntrs) + intel_put_excl_constraints(cpuc, event); +} + +static void intel_pebs_aliases_core2(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_precdist(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.PREC_DIST + * (0x01c0), which is a PEBS capable event, to get the same + * count. + * + * The PREC_DIST event has special support to minimize sample + * shadowing effects. 
One drawback is that it can be + * only programmed on counter 1, but that seems like an + * acceptable trade off. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_ivb(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_snb(event); + return intel_pebs_aliases_precdist(event); +} + +static void intel_pebs_aliases_skl(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_core2(event); + return intel_pebs_aliases_precdist(event); +} + +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ + unsigned long flags = x86_pmu.free_running_flags; + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; + return flags; +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip) { + if (!event->attr.freq) { + event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; + if (!(event->attr.sample_type & + ~intel_pmu_free_running_flags(event))) + event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + } + if (x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + } + + if (needs_branch_stack(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + + /* + * BTS is set up earlier in this path, so don't account twice + */ + if (!intel_pmu_has_bts(event)) { + /* disallow lbr if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } + } + + if (event->attr.type != PERF_TYPE_RAW) + return 0; + + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) + return 0; + + if (x86_pmu.version < 3) + return -EINVAL; + + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; + + return 0; +} + +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. 
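+ *
+ * The 'host' value keeps the currently enabled PEBS counters armed
+ * for the host side of the switch, while the 'guest' value of 0
+ * ensures PEBS is completely off while the guest runs.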
+ */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; + + *nr = 2; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static int hsw_hw_config(struct perf_event *event) +{ + int ret = intel_pmu_hw_config(event); + + if (ret) + return ret; + if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) + return 0; + event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + + /* + * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with + * PEBS or in ANY thread mode. Since the results are non-sensical forbid + * this combination. + */ + if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && + ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || + event->attr.precise_ip > 0)) + return -EOPNOTSUPP; + + if (event_is_checkpointed(event)) { + /* + * Sampling of checkpointed events can cause situations where + * the CPU constantly aborts because of a overflow, which is + * then checkpointed back and ignored. Forbid checkpointing + * for sampling. + * + * But still allow a long sampling period, so that perf stat + * from KVM works. + */ + if (event->attr.sample_period > 0 && + event->attr.sample_period < 0x7fffffff) + return -EOPNOTSUPP; + } + return 0; +} + +static struct event_constraint counter2_constraint = + EVENT_CONSTRAINT(0, 0x4, 0); + +static struct event_constraint * +hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_get_event_constraints(cpuc, idx, event); + + /* Handle special quirk on in_tx_checkpointed only in counter 2 */ + if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { + if (c->idxmsk64 & (1U << 2)) + return &counter2_constraint; + return &emptyconstraint; + } + + return c; +} + +/* + * Broadwell: + * + * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared + * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine + * the two to enforce a minimum period of 128 (the smallest value that has bits + * 0-5 cleared and >= 100). 
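+ *
+ * For example, a requested period of 100 is bumped up to 128, while
+ * a requested period of 200 is truncated down to 192 (200 & ~0x3f).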
+ * + * Because of how the code in x86_perf_event_set_period() works, the truncation + * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period + * to make up for the 'lost' events due to carrying the 'error' in period_left. + * + * Therefore the effective (average) period matches the requested period, + * despite coarser hardware granularity. + */ +static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +{ + if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (left < 128) + left = 128; + left &= ~0x3fu; + } + return left; +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); +PMU_FORMAT_ATTR(in_tx, "config:32"); +PMU_FORMAT_ATTR(in_tx_cp, "config:33"); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + +static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) +{ + struct intel_excl_cntrs *c; + + c = kzalloc_node(sizeof(struct intel_excl_cntrs), + GFP_KERNEL, cpu_to_node(cpu)); + if (c) { + raw_spin_lock_init(&c->lock); + c->core_id = -1; + } + return c; +} + +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto err; + } + + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); + + cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); + if (!cpuc->constraint_list) + goto err_shared_regs; + + cpuc->excl_cntrs = allocate_excl_cntrs(cpu); + if (!cpuc->excl_cntrs) + goto err_constraint_list; + + cpuc->excl_thread_id = 0; + } + + return NOTIFY_OK; + +err_constraint_list: + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + +err_shared_regs: + kfree(cpuc->shared_regs); + cpuc->shared_regs = NULL; + +err: + return NOTIFY_BAD; +} + +static void intel_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + + init_debug_store_on_cpu(cpu); + /* + * Deal with CPUs that don't clear their LBRs on power-up. 
+ */ + intel_pmu_lbr_reset(); + + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) + return; + + if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct intel_shared_regs *pc; + + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online[0] = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } + } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; + } + + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; + + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct intel_excl_cntrs *c; + + c = per_cpu(cpu_hw_events, i).excl_cntrs; + if (c && c->core_id == core_id) { + cpuc->kfree_on_online[1] = cpuc->excl_cntrs; + cpuc->excl_cntrs = c; + cpuc->excl_thread_id = 1; + break; + } + } + cpuc->excl_cntrs->core_id = core_id; + cpuc->excl_cntrs->refcnt++; + } +} + +static void free_excl_cntrs(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_excl_cntrs *c; + + c = cpuc->excl_cntrs; + if (c) { + if (c->core_id == -1 || --c->refcnt == 0) + kfree(c); + cpuc->excl_cntrs = NULL; + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + } +} + +static void intel_pmu_cpu_dying(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_shared_regs *pc; + + pc = cpuc->shared_regs; + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->shared_regs = NULL; + } + + free_excl_cntrs(cpu); + + fini_debug_store_on_cpu(cpu); +} + +static void intel_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (x86_pmu.pebs_active) + intel_pmu_pebs_sched_task(ctx, sched_in); + if (x86_pmu.lbr_nr) + intel_pmu_lbr_sched_task(ctx, sched_in); +} + +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +PMU_FORMAT_ATTR(ldlat, "config1:0-15"); + +PMU_FORMAT_ATTR(frontend, "config1:0-23"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + &format_attr_in_tx.attr, + &format_attr_in_tx_cp.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + &format_attr_ldlat.attr, /* PEBS load latency */ + NULL, +}; + +static struct attribute *skl_format_attr[] = { + &format_attr_frontend.attr, + NULL, +}; + +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + + /* + * Intel PMCs cannot be accessed sanely above 32-bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL<<31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = 
intel_event_sysfs_show, + + /* + * Virtual (or funny metal) CPU can define x86_pmu.extra_regs + * together with PMU version 1 and thus be using core_pmu with + * shared_regs. We need following callbacks here to allocate + * it properly. + */ + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, +}; + +static __initconst const struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .hw_config = intel_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, + + .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, + .sched_task = intel_pmu_sched_task, +}; + +static __init void intel_clovertown_quirk(void) +{ + /* + * PEBS is unreliable due to: + * + * AJ67 - PEBS may experience CPL leaks + * AJ68 - PEBS PMI may be delayed by one event + * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] + * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS + * + * AJ67 could be worked around by restricting the OS/USR flags. + * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. + * + * AJ106 could possibly be worked around by not allowing LBR + * usage from PEBS, including the fixup. + * AJ68 could possibly be worked around by always programming + * a pebs_event_reset[0] value and coping with the lost events. + * + * But taken together it might just make sense to not enable PEBS on + * these chips. + */ + pr_warn("PEBS disabled due to CPU errata\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. 
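+ *
+ * The probe flips the bits in 'mask', verifies that the new value
+ * reads back, and restores the original MSR contents on success.
+ * Returns true only if every access succeeded and the readback
+ * matched.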
+ */ +static bool check_msr(unsigned long msr, u64 mask) +{ + u64 val_old, val_new, val_tmp; + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + if (rdmsrl_safe(msr, &val_old)) + return false; + + /* + * Only change the bits which can be updated by wrmsrl. + */ + val_tmp = val_old ^ mask; + if (wrmsrl_safe(msr, val_tmp) || + rdmsrl_safe(msr, &val_new)) + return false; + + if (val_new != val_tmp) + return false; + + /* Here it's sure that the MSR can be safely accessed. + * Restore the old value and return. + */ + wrmsrl(msr, val_old); + + return true; +} + +static __init void intel_sandybridge_quirk(void) +{ + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); +} + +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + pr_info("CPU erratum AAJ80 worked around\n"); + } +} + +/* + * enable software workaround for errata: + * SNB: BJ122 + * IVB: BV98 + * HSW: HSD29 + * + * Only needed when HT is enabled. However detecting + * if HT is enabled is difficult (model specific). 
So instead, + * we enable the workaround in the early boot, and verify if + * it is needed in a later initcall phase once we have valid + * topology information to check if HT is actually enabled + */ +static __init void intel_ht_bug(void) +{ + x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; + + x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.commit_scheduling = intel_commit_scheduling; + x86_pmu.stop_scheduling = intel_stop_scheduling; +} + +EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); + +static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_capacity), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_commit), + EVENT_PTR(el_abort), + EVENT_PTR(el_capacity), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL +}; + +__init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + struct extra_reg *er; + int version, i; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + switch (boot_cpu_data.x86) { + case 0x6: + return p6_pmu_init(); + case 0xb: + return knc_pmu_init(); + case 0xf: + return p4_pmu_init(); + } + return -ENODEV; + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. 
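+ *
+ * CPUID leaf 0xA reports unsupported architectural events as set
+ * bits in EBX and the width of that bit vector in EAX; both are
+ * stashed in x86_pmu.events_maskl/events_mask_len and consumed
+ * later by intel_arch_events_quirk() and intel_nehalem_quirk().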
+ */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + if (boot_cpu_has(X86_FEATURE_PDCM)) { + u64 capabilities; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); + x86_pmu.intel_cap.capabilities = capabilities; + } + + intel_ds_init(); + + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65nm Core "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* 65nm Core2 "Merom" */ + x86_add_quirk(intel_clovertown_quirk); + case 22: /* 65nm Core2 "Merom-L" */ + case 23: /* 45nm Core2 "Penryn" */ + case 29: /* 45nm Core2 "Dunnington (MP) */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_core(); + + x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; + pr_cont("Core2 events, "); + break; + + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + x86_add_quirk(intel_nehalem_quirk); + + pr_cont("Nehalem events, "); + break; + + case 28: /* 45nm Atom "Pineview" */ + case 38: /* 45nm Atom "Lincroft" */ + case 39: /* 32nm Atom "Penwell" */ + case 53: /* 32nm Atom "Cloverview" */ + case 54: /* 32nm Atom "Cedarview" */ + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_core2; + pr_cont("Atom events, "); + break; + + case 55: /* 22nm Atom "Silvermont" */ + case 76: /* 14nm Atom "Airmont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + 
intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + pr_cont("Westmere events, "); + break; + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + x86_add_quirk(intel_sandybridge_quirk); + x86_add_quirk(intel_ht_bug); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + + + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("SandyBridge events, "); + break; + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + x86_add_quirk(intel_ht_bug); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + /* dTLB-load-misses on IVB is different than SNB */ + hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ + + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_ivb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to 
count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("IvyBridge events, "); + break; + + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + x86_add_quirk(intel_ht_bug); + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_hsw(); + + x86_pmu.event_constraints = intel_hsw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.lbr_double_abort = true; + pr_cont("Haswell events, "); + break; + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ + hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | + BDW_L3_MISS|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| + HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + + intel_pmu_lbr_init_hsw(); + + x86_pmu.event_constraints = intel_bdw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.limit_period = bdw_limit_period; + pr_cont("Broadwell events, "); + break; + + case 87: /* Knights Landing Xeon Phi */ + memcpy(hw_cache_event_ids, + slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, + knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_knl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_knl_extra_regs; + + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + pr_cont("Knights Landing events, "); + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, 
skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_skl_event_constraints; + x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; + x86_pmu.extra_regs = intel_skl_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_skl; + x86_pmu.pebs_prec_dist = true; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + skl_format_attr); + WARN_ON(!x86_pmu.format_attrs); + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Skylake events, "); + break; + + default: + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } + } + + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask == FIXED_EVENT_FLAGS + && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + } + c->idxmsk64 &= + ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); + } + } + + /* + * Access LBR MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support LBR MSR + * Check all LBT MSR here. + * Disable LBR access if any LBR MSRs can not be accessed. + */ + if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + x86_pmu.lbr_nr = 0; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && + check_msr(x86_pmu.lbr_to + i, 0xffffUL))) + x86_pmu.lbr_nr = 0; + } + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. 
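+ *
+ * Unlike the LBR probe above, which disables LBR completely, a
+ * failing extra register only clears that register's
+ * extra_msr_access flag; the one exception is EXTRA_REG_LBR, which
+ * also drops lbr_sel_map.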
+ */ + if (x86_pmu.extra_regs) { + for (er = x86_pmu.extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x11UL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } + } + + /* Support full width counters using alternative MSR range */ + if (x86_pmu.intel_cap.full_width_write) { + x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.perfctr = MSR_IA32_PMC0; + pr_cont("full-width counters, "); + } + + return 0; +} + +/* + * HT bug: phase 2 init + * Called once we have valid topology information to check + * whether or not HT is enabled + * If HT is off, then we disable the workaround + */ +static __init int fixup_ht_bug(void) +{ + int cpu = smp_processor_id(); + int w, c; + /* + * problem not present on this CPU model, nothing to do + */ + if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) + return 0; + + w = cpumask_weight(topology_sibling_cpumask(cpu)); + if (w > 1) { + pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); + return 0; + } + + if (lockup_detector_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } + + x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); + + x86_pmu.start_scheduling = NULL; + x86_pmu.commit_scheduling = NULL; + x86_pmu.stop_scheduling = NULL; + + lockup_detector_resume(); + + get_online_cpus(); + + for_each_online_cpu(c) { + free_excl_cntrs(c); + } + + put_online_cpus(); + pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); + return 0; +} +subsys_initcall(fixup_ht_bug) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e28f93158016..0a27243cb6c3 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c deleted file mode 100644 index fed2ab1f1065..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ /dev/null @@ -1,3773 +0,0 @@ -/* - * Per core/cpu state - * - * Used to coordinate shared registers between HT threads or - * among events on a single PMU. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "perf_event.h" - -/* - * Intel PerfMon, used on Core and later. 
- */ -static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ -}; - -static struct event_constraint intel_core_event_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_core2_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ - INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), - EVENT_EXTRA_END -}; - -static struct event_constraint intel_westmere_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_snb_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - 
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ - INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_ivb_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ - INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_westmere_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), - EVENT_EXTRA_END -}; - -static struct event_constraint intel_v1_event_constraints[] __read_mostly = -{ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_gen_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_slm_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_skl_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), 
/* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_knl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, - MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, - MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_skl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - /* - * Note the low 8 bits eventsel code is not a continuous field, containing - * some #GPing bits. These are masked out. - */ - INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), - EVENT_EXTRA_END -}; - -EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); -EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); - -struct attribute *nhm_events_attrs[] = { - EVENT_PTR(mem_ld_nhm), - NULL, -}; - -struct attribute *snb_events_attrs[] = { - EVENT_PTR(mem_ld_snb), - EVENT_PTR(mem_st_snb), - NULL, -}; - -static struct event_constraint intel_hsw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), - /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), - /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_bdw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ - EVENT_CONSTRAINT_END -}; - -static u64 
intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Notes on the events: - * - data reads do not include code reads (comparable to earlier tables) - * - data counts include speculative execution (except L1 write, dtlb, bpu) - * - remote node access includes remote memory, remote cache, remote mmio. - * - prefetches are not included in the counts. - * - icache miss does not include decoded icache - */ - -#define SKL_DEMAND_DATA_RD BIT_ULL(0) -#define SKL_DEMAND_RFO BIT_ULL(1) -#define SKL_ANY_RESPONSE BIT_ULL(16) -#define SKL_SUPPLIER_NONE BIT_ULL(17) -#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) -#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) -#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) -#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) -#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ - SKL_L3_MISS_REMOTE_HOP0_DRAM| \ - SKL_L3_MISS_REMOTE_HOP1_DRAM| \ - SKL_L3_MISS_REMOTE_HOP2P_DRAM) -#define SKL_SPL_HIT BIT_ULL(30) -#define SKL_SNOOP_NONE BIT_ULL(31) -#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) -#define SKL_SNOOP_MISS BIT_ULL(33) -#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) -#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) -#define SKL_SNOOP_HITM BIT_ULL(36) -#define SKL_SNOOP_NON_DRAM BIT_ULL(37) -#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ - SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ - SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ - SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) -#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD -#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ - SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ - SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ - SKL_SNOOP_HITM|SKL_SPL_HIT) -#define SKL_DEMAND_WRITE SKL_DEMAND_RFO -#define SKL_LLC_ACCESS SKL_ANY_RESPONSE -#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ - SKL_L3_MISS_REMOTE_HOP1_DRAM| \ - SKL_L3_MISS_REMOTE_HOP2P_DRAM) - -static __initconst const u64 skl_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ 
C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 skl_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| - SKL_LLC_ACCESS|SKL_ANY_SNOOP, - [ C(RESULT_MISS) ] = SKL_DEMAND_READ| - SKL_L3_MISS|SKL_ANY_SNOOP| - SKL_SUPPLIER_NONE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| - SKL_LLC_ACCESS|SKL_ANY_SNOOP, - [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS|SKL_ANY_SNOOP| - SKL_SUPPLIER_NONE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| - SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, - [ C(RESULT_MISS) ] = SKL_DEMAND_READ| - SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, - [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -#define SNB_DMND_DATA_RD (1ULL << 0) -#define SNB_DMND_RFO (1ULL << 1) -#define SNB_DMND_IFETCH (1ULL << 2) -#define SNB_DMND_WB (1ULL << 3) -#define SNB_PF_DATA_RD (1ULL << 4) -#define SNB_PF_RFO (1ULL << 5) -#define SNB_PF_IFETCH (1ULL << 6) -#define SNB_LLC_DATA_RD (1ULL << 7) -#define SNB_LLC_RFO (1ULL << 8) -#define SNB_LLC_IFETCH (1ULL << 9) -#define SNB_BUS_LOCKS (1ULL << 10) -#define SNB_STRM_ST (1ULL << 11) -#define SNB_OTHER (1ULL << 15) -#define SNB_RESP_ANY (1ULL << 16) -#define SNB_NO_SUPP (1ULL << 17) -#define SNB_LLC_HITM (1ULL << 18) -#define SNB_LLC_HITE (1ULL << 19) -#define SNB_LLC_HITS (1ULL << 20) -#define SNB_LLC_HITF (1ULL << 21) -#define SNB_LOCAL (1ULL << 22) -#define SNB_REMOTE (0xffULL << 23) -#define SNB_SNP_NONE (1ULL << 31) -#define SNB_SNP_NOT_NEEDED (1ULL << 32) -#define SNB_SNP_MISS (1ULL << 33) -#define SNB_NO_FWD (1ULL << 34) -#define SNB_SNP_FWD (1ULL << 35) -#define SNB_HITM (1ULL << 36) -#define SNB_NON_DRAM (1ULL << 37) - -#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) -#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) -#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) - -#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ - SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ - SNB_HITM) - -#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) -#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) - -#define 
SNB_L3_ACCESS SNB_RESP_ANY -#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) - -static __initconst const u64 snb_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, - }, - }, -}; - -static __initconst const u64 snb_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ - [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) 
] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - -}; - -/* - * Notes on the events: - * - data reads do not include code reads (comparable to earlier tables) - * - data counts include speculative execution (except L1 write, dtlb, bpu) - * - remote node access includes remote memory, remote cache, remote mmio. - * - prefetches are not included in the counts because they are not - * reliably counted. - */ - -#define HSW_DEMAND_DATA_RD BIT_ULL(0) -#define HSW_DEMAND_RFO BIT_ULL(1) -#define HSW_ANY_RESPONSE BIT_ULL(16) -#define HSW_SUPPLIER_NONE BIT_ULL(17) -#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) -#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) -#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) -#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) -#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ - HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ - HSW_L3_MISS_REMOTE_HOP2P) -#define HSW_SNOOP_NONE BIT_ULL(31) -#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) -#define HSW_SNOOP_MISS BIT_ULL(33) -#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) -#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) -#define HSW_SNOOP_HITM BIT_ULL(36) -#define HSW_SNOOP_NON_DRAM BIT_ULL(37) -#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ - HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ - HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ - HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) -#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) -#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD -#define HSW_DEMAND_WRITE HSW_DEMAND_RFO -#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ - HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) -#define HSW_LLC_ACCESS HSW_ANY_RESPONSE - -#define BDW_L3_MISS_LOCAL BIT(26) -#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ - HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ - HSW_L3_MISS_REMOTE_HOP2P) - - -static __initconst const u64 hsw_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ - }, - [ 
C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 hsw_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| - HSW_LLC_ACCESS, - [ C(RESULT_MISS) ] = HSW_DEMAND_READ| - HSW_L3_MISS|HSW_ANY_SNOOP, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| - HSW_LLC_ACCESS, - [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS|HSW_ANY_SNOOP, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| - HSW_L3_MISS_LOCAL_DRAM| - HSW_SNOOP_DRAM, - [ C(RESULT_MISS) ] = HSW_DEMAND_READ| - HSW_L3_MISS_REMOTE| - HSW_SNOOP_DRAM, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS_LOCAL_DRAM| - HSW_SNOOP_DRAM, - [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS_REMOTE| - HSW_SNOOP_DRAM, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 westmere_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - /* - * Use RFO, not 
WRITEBACK, because a write miss would typically occur - * on RFO. - */ - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, -}; - -/* - * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; - * See IA32 SDM Vol 3B 30.6.1.3 - */ - -#define NHM_DMND_DATA_RD (1 << 0) -#define NHM_DMND_RFO (1 << 1) -#define NHM_DMND_IFETCH (1 << 2) -#define NHM_DMND_WB (1 << 3) -#define NHM_PF_DATA_RD (1 << 4) -#define NHM_PF_DATA_RFO (1 << 5) -#define NHM_PF_IFETCH (1 << 6) -#define NHM_OFFCORE_OTHER (1 << 7) -#define NHM_UNCORE_HIT (1 << 8) -#define NHM_OTHER_CORE_HIT_SNP (1 << 9) -#define NHM_OTHER_CORE_HITM (1 << 10) - /* reserved */ -#define NHM_REMOTE_CACHE_FWD (1 << 12) -#define NHM_REMOTE_DRAM (1 << 13) -#define NHM_LOCAL_DRAM (1 << 14) -#define NHM_NON_DRAM (1 << 15) - -#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) -#define NHM_REMOTE (NHM_REMOTE_DRAM) - -#define NHM_DMND_READ (NHM_DMND_DATA_RD) -#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) -#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) - -#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) -#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) -#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) - -static __initconst const u64 nehalem_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ 
C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, - }, - }, -}; - -static __initconst const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - /* - * Use RFO, not WRITEBACK, because a write miss would typically occur - * on RFO. - */ - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, -}; - -static __initconst const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ 
C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ 
C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static struct extra_reg intel_slm_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), - EVENT_EXTRA_END -}; - -#define SLM_DMND_READ SNB_DMND_DATA_RD -#define SLM_DMND_WRITE SNB_DMND_RFO -#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) - -#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) -#define SLM_LLC_ACCESS SNB_RESP_ANY -#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) - -static __initconst const u64 slm_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, - }, - }, -}; - -static __initconst const u64 slm_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ - [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, 
- }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ -#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ -#define KNL_MCDRAM_LOCAL BIT_ULL(21) -#define KNL_MCDRAM_FAR BIT_ULL(22) -#define KNL_DDR_LOCAL BIT_ULL(23) -#define KNL_DDR_FAR BIT_ULL(24) -#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ - KNL_DDR_LOCAL | KNL_DDR_FAR) -#define KNL_L2_READ SLM_DMND_READ -#define KNL_L2_WRITE SLM_DMND_WRITE -#define KNL_L2_PREFETCH SLM_DMND_PREFETCH -#define KNL_L2_ACCESS SLM_LLC_ACCESS -#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ - KNL_DRAM_ANY | SNB_SNP_ANY | \ - SNB_NON_DRAM) - -static __initconst const u64 knl_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - [C(LL)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, - [C(RESULT_MISS)] = 0, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, - [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, - [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, - }, - }, -}; - -/* - * Use from PMIs where the LBRs are already disabled. - */ -static void __intel_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); - else - intel_bts_disable_local(); - - intel_pmu_pebs_disable_all(); -} - -static void intel_pmu_disable_all(void) -{ - __intel_pmu_disable_all(); - intel_pmu_lbr_disable_all(); -} - -static void __intel_pmu_enable_all(int added, bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - intel_pmu_pebs_enable_all(); - intel_pmu_lbr_enable_all(pmi); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!event)) - return; - - intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); -} - -static void intel_pmu_enable_all(int added) -{ - __intel_pmu_enable_all(added, false); -} - -/* - * Workaround for: - * Intel Errata AAK100 (model 26) - * Intel Errata AAP53 (model 30) - * Intel Errata BD53 (model 44) - * - * The official story: - * These chips need to be 'reset' when adding counters by programming the - * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either - * in sequence on the same PMC or on different PMCs. - * - * In practise it appears some of these events do in fact count, and - * we need to programm all 4 events. 
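The "magic" values 0x4300B5, 0x4300D2 and 0x4300B1 named in the errata note above are nothing exotic: they are ordinary PERFEVTSEL encodings with only the event code, USR, OS and ENABLE bits set. A minimal stand-alone decomposition (user-space only, not kernel code; bit positions are the architectural PERFEVTSEL layout):

/* How 0x4300B5 decomposes: event code in bits 0-7, USR bit 16,
 * OS bit 17, ENABLE bit 22.  Compiles stand-alone.
 */
#include <stdio.h>
#include <stdint.h>

#define EVTSEL_USR (1ULL << 16) /* count in ring 3 */
#define EVTSEL_OS  (1ULL << 17) /* count in ring 0 */
#define EVTSEL_EN  (1ULL << 22) /* enable the counter */

static uint64_t nhm_magic_evtsel(uint8_t event_code)
{
	return EVTSEL_EN | EVTSEL_OS | EVTSEL_USR | event_code;
}

int main(void)
{
	/* Should print 0x4300b5 0x4300d2 0x4300b1, matching the values
	 * programmed by the workaround that follows.
	 */
	printf("%#llx %#llx %#llx\n",
	       (unsigned long long)nhm_magic_evtsel(0xB5),
	       (unsigned long long)nhm_magic_evtsel(0xD2),
	       (unsigned long long)nhm_magic_evtsel(0xB1));
	return 0;
}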
- */ -static void intel_pmu_nhm_workaround(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - static const unsigned long nhm_magic[4] = { - 0x4300B5, - 0x4300D2, - 0x4300B1, - 0x4300B1 - }; - struct perf_event *event; - int i; - - /* - * The Errata requires below steps: - * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; - * 2) Configure 4 PERFEVTSELx with the magic events and clear - * the corresponding PMCx; - * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; - * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; - * 5) Clear 4 pairs of ERFEVTSELx and PMCx; - */ - - /* - * The real steps we choose are a little different from above. - * A) To reduce MSR operations, we don't run step 1) as they - * are already cleared before this function is called; - * B) Call x86_perf_event_update to save PMCx before configuring - * PERFEVTSELx with magic number; - * C) With step 5), we do clear only when the PERFEVTSELx is - * not used currently. - * D) Call x86_perf_event_set_period to restore PMCx; - */ - - /* We always operate 4 pairs of PERF Counters */ - for (i = 0; i < 4; i++) { - event = cpuc->events[i]; - if (event) - x86_perf_event_update(event); - } - - for (i = 0; i < 4; i++) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); - } - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); - - for (i = 0; i < 4; i++) { - event = cpuc->events[i]; - - if (event) { - x86_perf_event_set_period(event); - __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); - } else - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); - } -} - -static void intel_pmu_nhm_enable_all(int added) -{ - if (added) - intel_pmu_nhm_workaround(); - intel_pmu_enable_all(added); -} - -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - -static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) -{ - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); -} - -static inline bool event_is_checkpointed(struct perf_event *event) -{ - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; -} - -static void intel_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - intel_pmu_drain_bts_buffer(); - return; - } - - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); - cpuc->intel_cp_status &= ~(1ull << hwc->idx); - - /* - * must disable before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_disable(event); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc); - return; - } - - x86_pmu_disable_event(event); - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); -} - -static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) -{ - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 
0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - - /* - * ANY bit is supported in v3 and up - */ - if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) - bits |= 0x4; - - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); -} - -static void intel_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - if (!__this_cpu_read(cpu_hw_events.enabled)) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - /* - * must enabled before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_enable(event); - - if (event->attr.exclude_host) - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); - if (event->attr.exclude_guest) - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); - - if (unlikely(event_is_checkpointed(event))) - cpuc->intel_cp_status |= (1ull << hwc->idx); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc); - return; - } - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -int intel_pmu_save_and_restart(struct perf_event *event) -{ - x86_perf_event_update(event); - /* - * For a checkpointed counter always reset back to 0. This - * avoids a situation where the counter overflows, aborts the - * transaction and is then set back to shortly before the - * overflow, and overflows and aborts again. - */ - if (unlikely(event_is_checkpointed(event))) { - /* No race with NMIs because the counter should not be armed */ - wrmsrl(event->hw.event_base, 0); - local64_set(&event->hw.prev_count, 0); - } - return x86_perf_event_set_period(event); -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - unsigned long flags; - int idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); - wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - - if (ds) - ds->bts_index = ds->bts_buffer_base; - - /* Ack all overflows and disable fixed counters */ - if (x86_pmu.version >= 2) { - intel_pmu_ack_status(intel_pmu_get_status()); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - } - - /* Reset LBRs and LBR freezing */ - if (x86_pmu.lbr_nr) { - update_debugctlmsr(get_debugctlmsr() & - ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); - } - - local_irq_restore(flags); -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 status; - int handled; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. 
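The fixed-counter enable/disable paths above own one 4-bit field per fixed counter inside MSR_ARCH_PERFMON_FIXED_CTR_CTRL. A minimal sketch of that field arithmetic, with the MSR read/write replaced by a plain variable (that substitution is the only liberty taken):

/* Field update as done by intel_pmu_enable_fixed(): per fixed counter,
 * enable-OS (0x1), enable-USR (0x2), ANY (0x4) and PMI (0x8) live at
 * bit offset idx * 4 of FIXED_CTR_CTRL.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t enable_fixed(uint64_t ctrl_val, int idx, int usr, int os)
{
	uint64_t bits = 0x8ULL;               /* request PMI on overflow */
	uint64_t mask = 0xfULL << (idx * 4);

	if (usr)
		bits |= 0x2;
	if (os)
		bits |= 0x1;
	bits <<= (idx * 4);

	ctrl_val &= ~mask;                    /* clear the old field */
	return ctrl_val | bits;               /* install the new one */
}

int main(void)
{
	/* Fixed counter 1 (CPU_CLK_UNHALTED.CORE), user + kernel:
	 * expect 0xb in bits 4-7, i.e. 0xb0.
	 */
	printf("%#llx\n", (unsigned long long)enable_fixed(0, 1, 1, 1));
	return 0;
}

intel_pmu_disable_fixed() is the same operation with bits == 0, which is why only the mask clear remains there.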
- */ - if (!x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - __intel_pmu_disable_all(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - status = intel_pmu_get_status(); - if (!status) - goto done; - - loops = 0; -again: - intel_pmu_lbr_read(); - intel_pmu_ack_status(status); - if (++loops > 100) { - static bool warned = false; - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - inc_irq_stat(apic_perf_irqs); - - - /* - * Ignore a range of extra bits in status that do not indicate - * overflow by themselves. - */ - status &= ~(GLOBAL_STATUS_COND_CHG | - GLOBAL_STATUS_ASIF | - GLOBAL_STATUS_LBRS_FROZEN); - if (!status) - goto done; - - /* - * PEBS overflow sets bit 62 in the global status register - */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { - handled++; - x86_pmu.drain_pebs(regs); - } - - /* - * Intel PT - */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { - handled++; - intel_pt_interrupt(); - } - - /* - * Checkpointed counters can lead to 'spurious' PMIs because the - * rollback caused by the PMI will have cleared the overflow status - * bit. Therefore always force probe these counters. - */ - status |= cpuc->intel_cp_status; - - for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - handled++; - - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (has_branch_stack(event)) - data.br_stack = &cpuc->lbr_stack; - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - -done: - __intel_pmu_enable_all(0, true); - /* - * Only unmask the NMI after the overflow counters - * have been reset. This avoids spurious NMIs on - * Haswell CPUs. 
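The PMI handler above peels off the special status sources first (PEBS buffer overflow at bit 62, Intel PT at bit 55, as the code shows) and then treats every remaining set bit as a counter index. A hedged user-space model of that dispatch loop; the bit positions other than 62 and 55 are illustrative counter indices, not kernel constants:

/* Dispatch pattern of intel_pmu_handle_irq(), modelled on a plain
 * 64-bit status word.  Indices 32 and up correspond to the fixed
 * counters in the real register layout.
 */
#include <stdio.h>
#include <stdint.h>

static void handle_status(uint64_t status)
{
	if (status & (1ULL << 62)) {          /* PEBS buffer overflow */
		status &= ~(1ULL << 62);
		printf("drain PEBS buffer\n");
	}
	if (status & (1ULL << 55)) {          /* Intel PT interrupt */
		status &= ~(1ULL << 55);
		printf("PT interrupt\n");
	}
	while (status) {                      /* for_each_set_bit() equivalent */
		int bit = __builtin_ctzll(status);

		status &= status - 1;         /* clear lowest set bit */
		printf("counter %d overflowed\n", bit);
	}
}

int main(void)
{
	handle_status((1ULL << 62) | (1ULL << 1) | (1ULL << 33));
	return 0;
}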
- */ - if (x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - return handled; -} - -static struct event_constraint * -intel_bts_constraints(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - unsigned int hw_event, bts_event; - - if (event->attr.freq) - return NULL; - - hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; - bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - - if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) - return &bts_constraint; - - return NULL; -} - -static int intel_alt_er(int idx, u64 config) -{ - int alt_idx = idx; - - if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) - return idx; - - if (idx == EXTRA_REG_RSP_0) - alt_idx = EXTRA_REG_RSP_1; - - if (idx == EXTRA_REG_RSP_1) - alt_idx = EXTRA_REG_RSP_0; - - if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) - return idx; - - return alt_idx; -} - -static void intel_fixup_er(struct perf_event *event, int idx) -{ - event->hw.extra_reg.idx = idx; - - if (idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; - } else if (idx == EXTRA_REG_RSP_1) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } -} - -/* - * manage allocation of shared extra msr for certain events - * - * sharing can be: - * per-cpu: to be shared between the various events on a single PMU - * per-core: per-cpu + shared by HT threads - */ -static struct event_constraint * -__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - struct hw_perf_event_extra *reg) -{ - struct event_constraint *c = &emptyconstraint; - struct er_account *era; - unsigned long flags; - int idx = reg->idx; - - /* - * reg->alloc can be set due to existing state, so for fake cpuc we - * need to ignore this, otherwise we might fail to allocate proper fake - * state for this extra reg constraint. Also see the comment below. - */ - if (reg->alloc && !cpuc->is_fake) - return NULL; /* call x86_get_event_constraint() */ - -again: - era = &cpuc->shared_regs->regs[idx]; - /* - * we use spin_lock_irqsave() to avoid lockdep issues when - * passing a fake cpuc - */ - raw_spin_lock_irqsave(&era->lock, flags); - - if (!atomic_read(&era->ref) || era->config == reg->config) { - - /* - * If its a fake cpuc -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - * - * Not doing the ER fixup will only result in era->reg being - * wrong, but since we won't actually try and program hardware - * this isn't a problem either. - */ - if (!cpuc->is_fake) { - if (idx != reg->idx) - intel_fixup_er(event, idx); - - /* - * x86_schedule_events() can call get_event_constraints() - * multiple times on events in the case of incremental - * scheduling(). reg->alloc ensures we only do the ER - * allocation once. 
- */ - reg->alloc = 1; - } - - /* lock in msr value */ - era->config = reg->config; - era->reg = reg->reg; - - /* one more user */ - atomic_inc(&era->ref); - - /* - * need to call x86_get_event_constraint() - * to check if associated event has constraints - */ - c = NULL; - } else { - idx = intel_alt_er(idx, reg->config); - if (idx != reg->idx) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; - } - } - raw_spin_unlock_irqrestore(&era->lock, flags); - - return c; -} - -static void -__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) -{ - struct er_account *era; - - /* - * Only put constraint if extra reg was actually allocated. Also takes - * care of event which do not use an extra shared reg. - * - * Also, if this is a fake cpuc we shouldn't touch any event state - * (reg->alloc) and we don't care about leaving inconsistent cpuc state - * either since it'll be thrown out. - */ - if (!reg->alloc || cpuc->is_fake) - return; - - era = &cpuc->shared_regs->regs[reg->idx]; - - /* one fewer user */ - atomic_dec(&era->ref); - - /* allocate again next time */ - reg->alloc = 0; -} - -static struct event_constraint * -intel_shared_regs_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct event_constraint *c = NULL, *d; - struct hw_perf_event_extra *xreg, *breg; - - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) { - c = __intel_shared_reg_get_constraints(cpuc, event, xreg); - if (c == &emptyconstraint) - return c; - } - breg = &event->hw.branch_reg; - if (breg->idx != EXTRA_REG_NONE) { - d = __intel_shared_reg_get_constraints(cpuc, event, breg); - if (d == &emptyconstraint) { - __intel_shared_reg_put_constraints(cpuc, xreg); - c = d; - } - } - return c; -} - -struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - event->hw.flags |= c->flags; - return c; - } - } - } - - return &unconstrained; -} - -static struct event_constraint * -__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - c = intel_bts_constraints(event); - if (c) - return c; - - c = intel_shared_regs_constraints(cpuc, event); - if (c) - return c; - - c = intel_pebs_constraints(event); - if (c) - return c; - - return x86_get_event_constraints(cpuc, idx, event); -} - -static void -intel_start_scheduling(struct cpu_hw_events *cpuc) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - xl = &excl_cntrs->states[tid]; - - xl->sched_started = true; - /* - * lock shared state until we are done scheduling - * in stop_event_scheduling() - * makes scheduling appear as a transaction - */ - raw_spin_lock(&excl_cntrs->lock); -} - -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - 
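The shared-register code above serializes the two OFFCORE_RSP MSRs: an event may join a register whose programmed value already matches its own config, otherwise intel_alt_er() retries with the sibling register. A toy model of that reuse-or-retry rule (refcounting kept, locking and the valid_mask check dropped; names are illustrative, not the kernel's):

/* Sharing rule for OFFCORE_RSP_0/OFFCORE_RSP_1: a register can only be
 * shared by events programming the identical config value.
 */
#include <stdio.h>
#include <stdint.h>

struct er_slot {
	int      ref;     /* events currently using this MSR */
	uint64_t config;  /* value programmed into the MSR */
};

/* Return the slot the event may use, or -1 if both conflict. */
static int get_extra_reg(struct er_slot er[2], int idx, uint64_t config)
{
	int tries;

	for (tries = 0; tries < 2; tries++, idx ^= 1) {
		if (er[idx].ref == 0 || er[idx].config == config) {
			er[idx].config = config;
			er[idx].ref++;
			return idx;
		}
	}
	return -1;
}

int main(void)
{
	struct er_slot er[2] = { { 0, 0 }, { 0, 0 } };

	printf("%d\n", get_extra_reg(er, 0, 0x10001)); /* 0: slot free    */
	printf("%d\n", get_extra_reg(er, 0, 0x10001)); /* 0: same config  */
	printf("%d\n", get_extra_reg(er, 0, 0x20002)); /* 1: fell back    */
	printf("%d\n", get_extra_reg(er, 0, 0x30003)); /* -1: both busy   */
	return 0;
}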
- if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) - return; - - xl = &excl_cntrs->states[tid]; - - lockdep_assert_held(&excl_cntrs->lock); - - if (c->flags & PERF_X86_EVENT_EXCL) - xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xl->state[cntr] = INTEL_EXCL_SHARED; -} - -static void -intel_stop_scheduling(struct cpu_hw_events *cpuc) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - xl = &excl_cntrs->states[tid]; - - xl->sched_started = false; - /* - * release shared state lock (acquired in intel_start_scheduling()) - */ - raw_spin_unlock(&excl_cntrs->lock); -} - -static struct event_constraint * -intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - int idx, struct event_constraint *c) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xlo; - int tid = cpuc->excl_thread_id; - int is_excl, i; - - /* - * validating a group does not require - * enforcing cross-thread exclusion - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return c; - - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return c; - - /* - * because we modify the constraint, we need - * to make a copy. Static constraints come - * from static const tables. - * - * only needed when constraint has not yet - * been cloned (marked dynamic) - */ - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - struct event_constraint *cx; - - /* - * grab pre-allocated constraint entry - */ - cx = &cpuc->constraint_list[idx]; - - /* - * initialize dynamic constraint - * with static constraint - */ - *cx = *c; - - /* - * mark constraint as dynamic, so we - * can free it later on - */ - cx->flags |= PERF_X86_EVENT_DYNAMIC; - c = cx; - } - - /* - * From here on, the constraint is dynamic. 
- * Either it was just allocated above, or it - * was allocated during a earlier invocation - * of this function - */ - - /* - * state of sibling HT - */ - xlo = &excl_cntrs->states[tid ^ 1]; - - /* - * event requires exclusive counter access - * across HT threads - */ - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { - event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; - if (!cpuc->n_excl++) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); - } - - /* - * Modify static constraint with current dynamic - * state of thread - * - * EXCLUSIVE: sibling counter measuring exclusive event - * SHARED : sibling counter measuring non-exclusive event - * UNUSED : sibling counter unused - */ - for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { - /* - * exclusive event in sibling counter - * our corresponding counter cannot be used - * regardless of our event - */ - if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) - __clear_bit(i, c->idxmsk); - /* - * if measuring an exclusive event, sibling - * measuring non-exclusive, then counter cannot - * be used - */ - if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) - __clear_bit(i, c->idxmsk); - } - - /* - * recompute actual bit weight for scheduling algorithm - */ - c->weight = hweight64(c->idxmsk64); - - /* - * if we return an empty mask, then switch - * back to static empty constraint to avoid - * the cost of freeing later on - */ - if (c->weight == 0) - c = &emptyconstraint; - - return c; -} - -static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c1 = NULL; - struct event_constraint *c2; - - if (idx >= 0) /* fake does < 0 */ - c1 = cpuc->event_constraint[idx]; - - /* - * first time only - * - static constraint: no change across incremental scheduling calls - * - dynamic constraint: handled by intel_get_excl_constraints() - */ - c2 = __intel_get_event_constraints(cpuc, idx, event); - if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { - bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); - c1->weight = c2->weight; - c2 = c1; - } - - if (cpuc->excl_cntrs) - return intel_get_excl_constraints(cpuc, event, idx, c2); - - return c2; -} - -static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - int tid = cpuc->excl_thread_id; - struct intel_excl_states *xl; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake) - return; - - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { - hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; - if (!--cpuc->n_excl) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); - } - - /* - * If event was actually assigned, then mark the counter state as - * unused now. - */ - if (hwc->idx >= 0) { - xl = &excl_cntrs->states[tid]; - - /* - * put_constraint may be called from x86_schedule_events() - * which already has the lock held so here make locking - * conditional. 
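The exclusion logic above prunes the candidate counter mask against the sibling hyper-thread's per-counter state: a counter is dropped if the sibling holds it exclusively, or if our event is exclusive and the sibling counter is merely shared. The same rule restated as a compact stand-alone filter over a plain bitmask (enum names shortened, no other change intended):

/* Counter pruning as in intel_get_excl_constraints(), applied to a
 * bitmask instead of the kernel's event_constraint.
 */
#include <stdio.h>
#include <stdint.h>

enum xstate { UNUSED, SHARED, EXCLUSIVE };

static uint64_t prune_counters(uint64_t idxmsk, const enum xstate sibling[],
			       int ncounters, int event_is_excl)
{
	int i;

	for (i = 0; i < ncounters; i++) {
		if (!(idxmsk & (1ULL << i)))
			continue;
		if (sibling[i] == EXCLUSIVE)           /* never usable */
			idxmsk &= ~(1ULL << i);
		else if (event_is_excl && sibling[i] == SHARED)
			idxmsk &= ~(1ULL << i);        /* would conflict */
	}
	return idxmsk;
}

int main(void)
{
	enum xstate sibling[4] = { EXCLUSIVE, SHARED, UNUSED, SHARED };

	/* Exclusive event, all four counters allowed on entry:
	 * only counter 2 survives, so this prints 0x4.
	 */
	printf("%#llx\n",
	       (unsigned long long)prune_counters(0xf, sibling, 4, 1));
	return 0;
}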
- */ - if (!xl->sched_started) - raw_spin_lock(&excl_cntrs->lock); - - xl->state[hwc->idx] = INTEL_EXCL_UNUSED; - - if (!xl->sched_started) - raw_spin_unlock(&excl_cntrs->lock); - } -} - -static void -intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - - reg = &event->hw.extra_reg; - if (reg->idx != EXTRA_REG_NONE) - __intel_shared_reg_put_constraints(cpuc, reg); - - reg = &event->hw.branch_reg; - if (reg->idx != EXTRA_REG_NONE) - __intel_shared_reg_put_constraints(cpuc, reg); -} - -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - intel_put_shared_regs_event_constraints(cpuc, event); - - /* - * is PMU has exclusive counter restrictions, then - * all events are subject to and must call the - * put_excl_constraints() routine - */ - if (cpuc->excl_cntrs) - intel_put_excl_constraints(cpuc, event); -} - -static void intel_pebs_aliases_core2(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use INST_RETIRED.ANY_P - * (0x00c0), which is a PEBS capable event, to get the same - * count. - * - * INST_RETIRED.ANY_P counts the number of cycles that retires - * CNTMASK instructions. By setting CNTMASK to a value (16) - * larger than the maximum number of instructions that can be - * retired per cycle (4) and then inverting the condition, we - * count all cycles that retire 16 or less instructions, which - * is every cycle. - * - * Thereby we gain a PEBS capable cycle counter. - */ - u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_snb(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use UOPS_RETIRED.ALL - * (0x01c2), which is a PEBS capable event, to get the same - * count. - * - * UOPS_RETIRED.ALL counts the number of cycles that retires - * CNTMASK micro-ops. By setting CNTMASK to a value (16) - * larger than the maximum number of micro-ops that can be - * retired per cycle (4) and then inverting the condition, we - * count all cycles that retire 16 or less micro-ops, which - * is every cycle. - * - * Thereby we gain a PEBS capable cycle counter. - */ - u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_precdist(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use INST_RETIRED.PREC_DIST - * (0x01c0), which is a PEBS capable event, to get the same - * count. - * - * The PREC_DIST event has special support to minimize sample - * shadowing effects. 
One drawback is that it can be - * only programmed on counter 1, but that seems like an - * acceptable trade off. - */ - u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_ivb(struct perf_event *event) -{ - if (event->attr.precise_ip < 3) - return intel_pebs_aliases_snb(event); - return intel_pebs_aliases_precdist(event); -} - -static void intel_pebs_aliases_skl(struct perf_event *event) -{ - if (event->attr.precise_ip < 3) - return intel_pebs_aliases_core2(event); - return intel_pebs_aliases_precdist(event); -} - -static unsigned long intel_pmu_free_running_flags(struct perf_event *event) -{ - unsigned long flags = x86_pmu.free_running_flags; - - if (event->attr.use_clockid) - flags &= ~PERF_SAMPLE_TIME; - return flags; -} - -static int intel_pmu_hw_config(struct perf_event *event) -{ - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip) { - if (!event->attr.freq) { - event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & - ~intel_pmu_free_running_flags(event))) - event->hw.flags |= PERF_X86_EVENT_FREERUNNING; - } - if (x86_pmu.pebs_aliases) - x86_pmu.pebs_aliases(event); - } - - if (needs_branch_stack(event)) { - ret = intel_pmu_setup_lbr_filter(event); - if (ret) - return ret; - - /* - * BTS is set up earlier in this path, so don't account twice - */ - if (!intel_pmu_has_bts(event)) { - /* disallow lbr if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - } - - if (event->attr.type != PERF_TYPE_RAW) - return 0; - - if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) - return 0; - - if (x86_pmu.version < 3) - return -EINVAL; - - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - - event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; - - return 0; -} - -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - if (x86_pmu.guest_get_msrs) - return x86_pmu.guest_get_msrs(nr); - *nr = 0; - return NULL; -} -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); - -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; - - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; - /* - * If PMU counter has PEBS enabled it is not enough to disable counter - * on a guest entry since PEBS memory write can overshoot guest entry - * and corrupt guest memory. Disabling PEBS solves the problem. 
- */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - - *nr = 2; - return arr; -} - -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - - arr[idx].msr = x86_pmu_config_addr(idx); - arr[idx].host = arr[idx].guest = 0; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - arr[idx].host = arr[idx].guest = - event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; - - if (event->attr.exclude_host) - arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - else if (event->attr.exclude_guest) - arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - } - - *nr = x86_pmu.num_counters; - return arr; -} - -static void core_pmu_enable_event(struct perf_event *event) -{ - if (!event->attr.exclude_host) - x86_pmu_enable_event(event); -} - -static void core_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - - if (!test_bit(idx, cpuc->active_mask) || - cpuc->events[idx]->attr.exclude_host) - continue; - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } -} - -static int hsw_hw_config(struct perf_event *event) -{ - int ret = intel_pmu_hw_config(event); - - if (ret) - return ret; - if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) - return 0; - event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); - - /* - * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with - * PEBS or in ANY thread mode. Since the results are non-sensical forbid - * this combination. - */ - if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && - ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || - event->attr.precise_ip > 0)) - return -EOPNOTSUPP; - - if (event_is_checkpointed(event)) { - /* - * Sampling of checkpointed events can cause situations where - * the CPU constantly aborts because of a overflow, which is - * then checkpointed back and ignored. Forbid checkpointing - * for sampling. - * - * But still allow a long sampling period, so that perf stat - * from KVM works. - */ - if (event->attr.sample_period > 0 && - event->attr.sample_period < 0x7fffffff) - return -EOPNOTSUPP; - } - return 0; -} - -static struct event_constraint counter2_constraint = - EVENT_CONSTRAINT(0, 0x4, 0); - -static struct event_constraint * -hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - c = intel_get_event_constraints(cpuc, idx, event); - - /* Handle special quirk on in_tx_checkpointed only in counter 2 */ - if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { - if (c->idxmsk64 & (1U << 2)) - return &counter2_constraint; - return &emptyconstraint; - } - - return c; -} - -/* - * Broadwell: - * - * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared - * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine - * the two to enforce a minimum period of 128 (the smallest value that has bits - * 0-5 cleared and >= 100). 
- * - * Because of how the code in x86_perf_event_set_period() works, the truncation - * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period - * to make up for the 'lost' events due to carrying the 'error' in period_left. - * - * Therefore the effective (average) period matches the requested period, - * despite coarser hardware granularity. - */ -static unsigned bdw_limit_period(struct perf_event *event, unsigned left) -{ - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fu; - } - return left; -} - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(pc, "config:19" ); -PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); -PMU_FORMAT_ATTR(in_tx, "config:32"); -PMU_FORMAT_ATTR(in_tx_cp, "config:33"); - -static struct attribute *intel_arch_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -ssize_t intel_event_sysfs_show(char *page, u64 config) -{ - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); - - return x86_event_sysfs_show(page, config, event); -} - -struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - struct intel_shared_regs *regs; - int i; - - regs = kzalloc_node(sizeof(struct intel_shared_regs), - GFP_KERNEL, cpu_to_node(cpu)); - if (regs) { - /* - * initialize the locks to keep lockdep happy - */ - for (i = 0; i < EXTRA_REG_MAX; i++) - raw_spin_lock_init(®s->regs[i].lock); - - regs->core_id = -1; - } - return regs; -} - -static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) -{ - struct intel_excl_cntrs *c; - - c = kzalloc_node(sizeof(struct intel_excl_cntrs), - GFP_KERNEL, cpu_to_node(cpu)); - if (c) { - raw_spin_lock_init(&c->lock); - c->core_id = -1; - } - return c; -} - -static int intel_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - - if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto err; - } - - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); - - cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); - if (!cpuc->constraint_list) - goto err_shared_regs; - - cpuc->excl_cntrs = allocate_excl_cntrs(cpu); - if (!cpuc->excl_cntrs) - goto err_constraint_list; - - cpuc->excl_thread_id = 0; - } - - return NOTIFY_OK; - -err_constraint_list: - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; - -err_shared_regs: - kfree(cpuc->shared_regs); - cpuc->shared_regs = NULL; - -err: - return NOTIFY_BAD; -} - -static void intel_pmu_cpu_starting(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int core_id = topology_core_id(cpu); - int i; - - init_debug_store_on_cpu(cpu); - /* - * Deal with CPUs that don't clear their LBRs on power-up. 
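The rounding that bdw_limit_period() applies (only for the INST_RETIRED.ALL encoding) is easy to see in isolation; here is a minimal userspace sketch of just that arithmetic, not part of the patch:

#include <stdio.h>

/* Same rounding as bdw_limit_period(): at least 128, low 6 bits cleared. */
static unsigned bdw_round_period(unsigned left)
{
	if (left < 128)
		left = 128;
	return left & ~0x3fu;
}

int main(void)
{
	unsigned samples[] = { 1, 100, 128, 129, 191, 192, 100003 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%u -> %u\n", samples[i], bdw_round_period(samples[i]));
	return 0;
}

Every result satisfies both errata constraints (>= 100 and bits 0-5 clear), and as the comment above notes, the truncation averages out because the scheduler carries the remainder in period_left.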
- */ - intel_pmu_lbr_reset(); - - cpuc->lbr_sel = NULL; - - if (!cpuc->shared_regs) - return; - - if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { - for_each_cpu(i, topology_sibling_cpumask(cpu)) { - struct intel_shared_regs *pc; - - pc = per_cpu(cpu_hw_events, i).shared_regs; - if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online[0] = cpuc->shared_regs; - cpuc->shared_regs = pc; - break; - } - } - cpuc->shared_regs->core_id = core_id; - cpuc->shared_regs->refcnt++; - } - - if (x86_pmu.lbr_sel_map) - cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; - - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - for_each_cpu(i, topology_sibling_cpumask(cpu)) { - struct intel_excl_cntrs *c; - - c = per_cpu(cpu_hw_events, i).excl_cntrs; - if (c && c->core_id == core_id) { - cpuc->kfree_on_online[1] = cpuc->excl_cntrs; - cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; - break; - } - } - cpuc->excl_cntrs->core_id = core_id; - cpuc->excl_cntrs->refcnt++; - } -} - -static void free_excl_cntrs(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_excl_cntrs *c; - - c = cpuc->excl_cntrs; - if (c) { - if (c->core_id == -1 || --c->refcnt == 0) - kfree(c); - cpuc->excl_cntrs = NULL; - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; - } -} - -static void intel_pmu_cpu_dying(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_shared_regs *pc; - - pc = cpuc->shared_regs; - if (pc) { - if (pc->core_id == -1 || --pc->refcnt == 0) - kfree(pc); - cpuc->shared_regs = NULL; - } - - free_excl_cntrs(cpu); - - fini_debug_store_on_cpu(cpu); -} - -static void intel_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (x86_pmu.pebs_active) - intel_pmu_pebs_sched_task(ctx, sched_in); - if (x86_pmu.lbr_nr) - intel_pmu_lbr_sched_task(ctx, sched_in); -} - -PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); - -PMU_FORMAT_ATTR(ldlat, "config1:0-15"); - -PMU_FORMAT_ATTR(frontend, "config1:0-23"); - -static struct attribute *intel_arch3_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_any.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - &format_attr_in_tx.attr, - &format_attr_in_tx_cp.attr, - - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, -}; - -static struct attribute *skl_format_attr[] = { - &format_attr_frontend.attr, - NULL, -}; - -static __initconst const struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = core_pmu_enable_all, - .enable = core_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, - - /* - * Intel PMCs cannot be accessed sanely above 32-bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL<<31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .event_constraints = intel_core_event_constraints, - .guest_get_msrs = core_guest_get_msrs, - .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = 
intel_event_sysfs_show, - - /* - * Virtual (or funny metal) CPU can define x86_pmu.extra_regs - * together with PMU version 1 and thus be using core_pmu with - * shared_regs. We need following callbacks here to allocate - * it properly. - */ - .cpu_prepare = intel_pmu_cpu_prepare, - .cpu_starting = intel_pmu_cpu_starting, - .cpu_dying = intel_pmu_cpu_dying, -}; - -static __initconst const struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .hw_config = intel_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .pebs_aliases = intel_pebs_aliases_core2, - - .format_attrs = intel_arch3_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - - .cpu_prepare = intel_pmu_cpu_prepare, - .cpu_starting = intel_pmu_cpu_starting, - .cpu_dying = intel_pmu_cpu_dying, - .guest_get_msrs = intel_guest_get_msrs, - .sched_task = intel_pmu_sched_task, -}; - -static __init void intel_clovertown_quirk(void) -{ - /* - * PEBS is unreliable due to: - * - * AJ67 - PEBS may experience CPL leaks - * AJ68 - PEBS PMI may be delayed by one event - * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] - * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS - * - * AJ67 could be worked around by restricting the OS/USR flags. - * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. - * - * AJ106 could possibly be worked around by not allowing LBR - * usage from PEBS, including the fixup. - * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. - * - * But taken together it might just make sense to not enable PEBS on - * these chips. - */ - pr_warn("PEBS disabled due to CPU errata\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; -} - -static int intel_snb_pebs_broken(int cpu) -{ - u32 rev = UINT_MAX; /* default to broken for unknown models */ - - switch (cpu_data(cpu).x86_model) { - case 42: /* SNB */ - rev = 0x28; - break; - - case 45: /* SNB-EP */ - switch (cpu_data(cpu).x86_mask) { - case 6: rev = 0x618; break; - case 7: rev = 0x70c; break; - } - } - - return (cpu_data(cpu).microcode < rev); -} - -static void intel_snb_check_microcode(void) -{ - int pebs_broken = 0; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if ((pebs_broken = intel_snb_pebs_broken(cpu))) - break; - } - put_online_cpus(); - - if (pebs_broken == x86_pmu.pebs_broken) - return; - - /* - * Serialized by the microcode lock.. - */ - if (x86_pmu.pebs_broken) { - pr_info("PEBS enabled due to microcode update\n"); - x86_pmu.pebs_broken = 0; - } else { - pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); - x86_pmu.pebs_broken = 1; - } -} - -/* - * Under certain circumstances, access certain MSR may cause #GP. - * The function tests if the input MSR can be safely accessed. 
- */ -static bool check_msr(unsigned long msr, u64 mask) -{ - u64 val_old, val_new, val_tmp; - - /* - * Read the current value, change it and read it back to see if it - * matches, this is needed to detect certain hardware emulators - * (qemu/kvm) that don't trap on the MSR access and always return 0s. - */ - if (rdmsrl_safe(msr, &val_old)) - return false; - - /* - * Only change the bits which can be updated by wrmsrl. - */ - val_tmp = val_old ^ mask; - if (wrmsrl_safe(msr, val_tmp) || - rdmsrl_safe(msr, &val_new)) - return false; - - if (val_new != val_tmp) - return false; - - /* Here it's sure that the MSR can be safely accessed. - * Restore the old value and return. - */ - wrmsrl(msr, val_old); - - return true; -} - -static __init void intel_sandybridge_quirk(void) -{ - x86_pmu.check_microcode = intel_snb_check_microcode; - intel_snb_check_microcode(); -} - -static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { - { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, - { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, - { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, - { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, - { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, - { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, - { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, -}; - -static __init void intel_arch_events_quirk(void) -{ - int bit; - - /* disable event that reported as not presend by cpuid */ - for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { - intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - pr_warn("CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); - } -} - -static __init void intel_nehalem_quirk(void) -{ - union cpuid10_ebx ebx; - - ebx.full = x86_pmu.events_maskl; - if (ebx.split.no_branch_misses_retired) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; - ebx.split.no_branch_misses_retired = 0; - x86_pmu.events_maskl = ebx.full; - pr_info("CPU erratum AAJ80 worked around\n"); - } -} - -/* - * enable software workaround for errata: - * SNB: BJ122 - * IVB: BV98 - * HSW: HSD29 - * - * Only needed when HT is enabled. However detecting - * if HT is enabled is difficult (model specific). 
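The check_msr() probe above relies on a write/read-back round trip to unmask emulators that silently ignore MSR writes. A hedged userspace mock of that pattern (fake rdmsr/wrmsr, no real MSR access) shows why an always-zero emulator fails the probe:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Mocked MSR back-end so the probe logic can run in userspace. */
static uint64_t fake_msr;
static bool emulate_always_zero;	/* model a VMM that ignores writes */

static int rdmsr_mock(uint64_t *val)
{
	*val = emulate_always_zero ? 0 : fake_msr;
	return 0;
}

static int wrmsr_mock(uint64_t val)
{
	if (!emulate_always_zero)
		fake_msr = val;
	return 0;
}

/* Same idea as check_msr(): flip some bits, read back, compare, restore. */
static bool probe_msr(uint64_t mask)
{
	uint64_t val_old, val_tmp, val_new;

	if (rdmsr_mock(&val_old))
		return false;
	val_tmp = val_old ^ mask;
	if (wrmsr_mock(val_tmp) || rdmsr_mock(&val_new))
		return false;
	if (val_new != val_tmp)
		return false;
	wrmsr_mock(val_old);
	return true;
}

int main(void)
{
	emulate_always_zero = false;
	printf("real MSR behaviour  : %s\n", probe_msr(0x3) ? "usable" : "rejected");

	emulate_always_zero = true;
	printf("always-zero emulator: %s\n", probe_msr(0x3) ? "usable" : "rejected");
	return 0;
}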
So instead, - * we enable the workaround in the early boot, and verify if - * it is needed in a later initcall phase once we have valid - * topology information to check if HT is actually enabled - */ -static __init void intel_ht_bug(void) -{ - x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; - - x86_pmu.start_scheduling = intel_start_scheduling; - x86_pmu.commit_scheduling = intel_commit_scheduling; - x86_pmu.stop_scheduling = intel_stop_scheduling; -} - -EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") - -/* Haswell special events */ -EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); -EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); -EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); -EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); -EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); -EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); -EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); -EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); -EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); -EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); -EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); -EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); - -static struct attribute *hsw_events_attrs[] = { - EVENT_PTR(tx_start), - EVENT_PTR(tx_commit), - EVENT_PTR(tx_abort), - EVENT_PTR(tx_capacity), - EVENT_PTR(tx_conflict), - EVENT_PTR(el_start), - EVENT_PTR(el_commit), - EVENT_PTR(el_abort), - EVENT_PTR(el_capacity), - EVENT_PTR(el_conflict), - EVENT_PTR(cycles_t), - EVENT_PTR(cycles_ct), - EVENT_PTR(mem_ld_hsw), - EVENT_PTR(mem_st_hsw), - NULL -}; - -__init int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - union cpuid10_ebx ebx; - struct event_constraint *c; - unsigned int unused; - struct extra_reg *er; - int version, i; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - switch (boot_cpu_data.x86) { - case 0x6: - return p6_pmu_init(); - case 0xb: - return knc_pmu_init(); - case 0xf: - return p4_pmu_init(); - } - return -ENODEV; - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. 
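The EVENT_ATTR_STR() aliases above become sysfs event strings; for example tx-start is event 0xc9 with umask 0x1, which per the event/umask format fields defined earlier packs into raw config 0x1c9. As a rough, hypothetical userspace illustration (assuming a TSX-capable Haswell-or-later CPU running this driver, and not part of the patch), one could count it with perf_event_open():

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = 0x1c9;		/* event=0xc9, umask=0x1: tx-start */
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... run some RTM transactions here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("tx-start: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}

On a kernel exposing these attributes the same measurement is available by name, e.g. perf stat -e cpu/tx-start/ -- <cmd>.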
- */ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); - if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - x86_pmu = core_pmu; - else - x86_pmu = intel_pmu; - - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.cntval_bits = eax.split.bit_width; - x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; - - x86_pmu.events_maskl = ebx.full; - x86_pmu.events_mask_len = eax.split.mask_length; - - x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); - - /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - if (version > 1) - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - if (boot_cpu_has(X86_FEATURE_PDCM)) { - u64 capabilities; - - rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); - x86_pmu.intel_cap.capabilities = capabilities; - } - - intel_ds_init(); - - x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 14: /* 65nm Core "Yonah" */ - pr_cont("Core events, "); - break; - - case 15: /* 65nm Core2 "Merom" */ - x86_add_quirk(intel_clovertown_quirk); - case 22: /* 65nm Core2 "Merom-L" */ - case 23: /* 45nm Core2 "Penryn" */ - case 29: /* 45nm Core2 "Dunnington (MP) */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - intel_pmu_lbr_init_core(); - - x86_pmu.event_constraints = intel_core2_event_constraints; - x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; - pr_cont("Core2 events, "); - break; - - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_nhm(); - - x86_pmu.event_constraints = intel_nehalem_event_constraints; - x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.enable_all = intel_pmu_nhm_enable_all; - x86_pmu.extra_regs = intel_nehalem_extra_regs; - - x86_pmu.cpu_events = nhm_events_attrs; - - /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); - - x86_add_quirk(intel_nehalem_quirk); - - pr_cont("Nehalem events, "); - break; - - case 28: /* 45nm Atom "Pineview" */ - case 38: /* 45nm Atom "Lincroft" */ - case 39: /* 32nm Atom "Penwell" */ - case 53: /* 32nm Atom "Cloverview" */ - case 54: /* 32nm Atom "Cedarview" */ - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - intel_pmu_lbr_init_atom(); - - x86_pmu.event_constraints = intel_gen_event_constraints; - x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_core2; - pr_cont("Atom events, "); - break; - - case 55: /* 22nm Atom "Silvermont" */ - case 76: /* 14nm Atom "Airmont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - 
intel_pmu_lbr_init_atom(); - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; - x86_pmu.extra_regs = intel_slm_extra_regs; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - pr_cont("Silvermont events, "); - break; - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_nhm(); - - x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.enable_all = intel_pmu_nhm_enable_all; - x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; - x86_pmu.extra_regs = intel_westmere_extra_regs; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - - x86_pmu.cpu_events = nhm_events_attrs; - - /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); - - pr_cont("Westmere events, "); - break; - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - x86_add_quirk(intel_sandybridge_quirk); - x86_add_quirk(intel_ht_bug); - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_snb_event_constraints; - x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - if (boot_cpu_data.x86_model == 45) - x86_pmu.extra_regs = intel_snbep_extra_regs; - else - x86_pmu.extra_regs = intel_snb_extra_regs; - - - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.cpu_events = snb_events_attrs; - - /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); - - pr_cont("SandyBridge events, "); - break; - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - x86_add_quirk(intel_ht_bug); - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - /* dTLB-load-misses on IVB is different than SNB */ - hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ - - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_ivb_event_constraints; - x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - if (boot_cpu_data.x86_model == 62) - x86_pmu.extra_regs = intel_snbep_extra_regs; - else - x86_pmu.extra_regs = intel_snb_extra_regs; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.cpu_events = snb_events_attrs; - - /* UOPS_ISSUED.ANY,c=1,i=1 to 
count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - - pr_cont("IvyBridge events, "); - break; - - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - x86_add_quirk(intel_ht_bug); - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_hsw(); - - x86_pmu.event_constraints = intel_hsw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.lbr_double_abort = true; - pr_cont("Haswell events, "); - break; - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ - hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | - BDW_L3_MISS|HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| - HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| - BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| - BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - - intel_pmu_lbr_init_hsw(); - - x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.limit_period = bdw_limit_period; - pr_cont("Broadwell events, "); - break; - - case 87: /* Knights Landing Xeon Phi */ - memcpy(hw_cache_event_ids, - slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, - knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_knl(); - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; - x86_pmu.extra_regs = intel_knl_extra_regs; - - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - pr_cont("Knights Landing events, "); - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, 
skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_skl(); - - x86_pmu.event_constraints = intel_skl_event_constraints; - x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; - x86_pmu.extra_regs = intel_skl_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_skl; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); - x86_pmu.cpu_events = hsw_events_attrs; - pr_cont("Skylake events, "); - break; - - default: - switch (x86_pmu.version) { - case 1: - x86_pmu.event_constraints = intel_v1_event_constraints; - pr_cont("generic architected perfmon v1, "); - break; - default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); - break; - } - } - - if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; - - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - } - c->idxmsk64 &= - ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); - c->weight = hweight64(c->idxmsk64); - } - } - - /* - * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. - * Disable LBR access if any LBR MSRs can not be accessed. - */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) - x86_pmu.lbr_nr = 0; - for (i = 0; i < x86_pmu.lbr_nr; i++) { - if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && - check_msr(x86_pmu.lbr_to + i, 0xffffUL))) - x86_pmu.lbr_nr = 0; - } - - /* - * Access extra MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support offcore event - * Check all extra_regs here. 
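The intel_ctrl value assembled above is just the GLOBAL_CTRL enable mask: one bit per general-purpose counter from bit 0, plus one bit per fixed counter starting at the fixed-counter index. A minimal sketch of that arithmetic, assuming the fixed counters start at bit 32 (mirroring INTEL_PMC_IDX_FIXED) and not part of the patch:

#include <stdio.h>
#include <stdint.h>

#define PMC_IDX_FIXED	32	/* assumed: fixed counters start at bit 32 */

/* Build the GLOBAL_CTRL enable mask the way intel_pmu_init() does. */
static uint64_t global_ctrl_mask(int num_counters, int num_counters_fixed)
{
	uint64_t ctrl = (1ULL << num_counters) - 1;

	ctrl |= ((1ULL << num_counters_fixed) - 1) << PMC_IDX_FIXED;
	return ctrl;
}

int main(void)
{
	/* e.g. 4 general-purpose + 3 fixed counters */
	printf("intel_ctrl = %#llx\n",
	       (unsigned long long)global_ctrl_mask(4, 3));
	return 0;
}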
- */ - if (x86_pmu.extra_regs) { - for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x11UL); - /* Disable LBR select mapping */ - if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) - x86_pmu.lbr_sel_map = NULL; - } - } - - /* Support full width counters using alternative MSR range */ - if (x86_pmu.intel_cap.full_width_write) { - x86_pmu.max_period = x86_pmu.cntval_mask; - x86_pmu.perfctr = MSR_IA32_PMC0; - pr_cont("full-width counters, "); - } - - return 0; -} - -/* - * HT bug: phase 2 init - * Called once we have valid topology information to check - * whether or not HT is enabled - * If HT is off, then we disable the workaround - */ -static __init int fixup_ht_bug(void) -{ - int cpu = smp_processor_id(); - int w, c; - /* - * problem not present on this CPU model, nothing to do - */ - if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) - return 0; - - w = cpumask_weight(topology_sibling_cpumask(cpu)); - if (w > 1) { - pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); - return 0; - } - - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } - - x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); - - x86_pmu.start_scheduling = NULL; - x86_pmu.commit_scheduling = NULL; - x86_pmu.stop_scheduling = NULL; - - lockup_detector_resume(); - - get_online_cpus(); - - for_each_online_cpu(c) { - free_excl_cntrs(c); - } - - put_online_cpus(); - pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); - return 0; -} -subsys_initcall(fixup_ht_bug) -- cgit From 5c781a3daa3053c355259894f9e7a478deb0cb46 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:09 +0100 Subject: perf/x86: Move perf_event_intel_cqm.c ........ => x86/events/intel/cqm.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/cqm.c | 1391 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 1391 ---------------------------- 4 files changed, 1393 insertions(+), 1393 deletions(-) create mode 100644 arch/x86/events/intel/cqm.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_cqm.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 834e9ae507f9..f97c283ec7a2 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c new file mode 100644 index 000000000000..d1b623a15ac0 --- /dev/null +++ b/arch/x86/events/intel/cqm.c @@ -0,0 +1,1391 @@ +/* + * Intel Cache Quality-of-Service Monitoring (CQM) support. + * + * Based very, very heavily on work by Peter Zijlstra. 
+ */ + +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +#define MSR_IA32_PQR_ASSOC 0x0c8f +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +static u32 cqm_max_rmid = -1; +static unsigned int cqm_l3_scale; /* supposedly cacheline size */ + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; +}; + +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Both functions which modify the state + * (intel_cqm_event_start and intel_cqm_event_stop) are called with + * interrupts disabled, which is sufficient for the protection. + */ +static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); + +/* + * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. + * Also protects event->hw.cqm_rmid + * + * Hold either for stability, both for modification of ->hw.cqm_rmid. + */ +static DEFINE_MUTEX(cache_mutex); +static DEFINE_RAW_SPINLOCK(cache_lock); + +/* + * Groups of events that have the same target(s), one RMID per group. + */ +static LIST_HEAD(cache_groups); + +/* + * Mask of CPUs for reading CQM values. We only need one per-socket. + */ +static cpumask_t cqm_cpumask; + +#define RMID_VAL_ERROR (1ULL << 63) +#define RMID_VAL_UNAVAIL (1ULL << 62) + +#define QOS_L3_OCCUP_EVENT_ID (1 << 0) + +#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID + +/* + * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). + * + * This rmid is always free and is guaranteed to have an associated + * near-zero occupancy value, i.e. no cachelines are tagged with this + * RMID, once __intel_cqm_rmid_rotate() returns. + */ +static u32 intel_cqm_rotation_rmid; + +#define INVALID_RMID (-1) + +/* + * Is @rmid valid for programming the hardware? + * + * rmid 0 is reserved by the hardware for all non-monitored tasks, which + * means that we should never come across an rmid with that value. + * Likewise, an rmid value of -1 is used to indicate "no rmid currently + * assigned" and is used as part of the rotation code. + */ +static inline bool __rmid_valid(u32 rmid) +{ + if (!rmid || rmid == INVALID_RMID) + return false; + + return true; +} + +static u64 __rmid_read(u32 rmid) +{ + u64 val; + + /* + * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, + * it just says that to increase confusion. + */ + wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + /* + * Aside from the ERROR and UNAVAIL bits, assume this thing returns + * the number of cachelines tagged with @rmid. + */ + return val; +} + +enum rmid_recycle_state { + RMID_YOUNG = 0, + RMID_AVAILABLE, + RMID_DIRTY, +}; + +struct cqm_rmid_entry { + u32 rmid; + enum rmid_recycle_state state; + struct list_head list; + unsigned long queue_time; +}; + +/* + * cqm_rmid_free_lru - A least recently used list of RMIDs. + * + * Oldest entry at the head, newest (most recently used) entry at the + * tail. This list is never traversed, it's only used to keep track of + * the lru order. 
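A raw MSR_IA32_QM_CTR value as read by __rmid_read() carries the error/unavailable flags in its top bits and otherwise counts tagged cachelines; cqm_l3_scale is what relates that count to the byte-based __intel_cqm_max_threshold. A small standalone sketch of the decode (userspace mock, not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

/*
 * Decode a raw occupancy reading: error/unavailable bits on top,
 * otherwise a count of tagged cachelines, scaled to bytes by the
 * (assumed cacheline-sized) l3_scale factor.
 */
static int decode_occupancy(uint64_t val, unsigned int l3_scale, uint64_t *bytes)
{
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return -1;
	*bytes = val * l3_scale;
	return 0;
}

int main(void)
{
	uint64_t bytes;

	if (!decode_occupancy(1024, 64, &bytes))	/* 1024 lines, 64 bytes each */
		printf("occupancy: %llu bytes\n", (unsigned long long)bytes);

	if (decode_occupancy(RMID_VAL_UNAVAIL, 64, &bytes))
		printf("reading unavailable, keep previous value\n");
	return 0;
}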
That is, we only pick entries of the head or insert + * them on the tail. + * + * All entries on the list are 'free', and their RMIDs are not currently + * in use. To mark an RMID as in use, remove its entry from the lru + * list. + * + * + * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. + * + * This list is contains RMIDs that no one is currently using but that + * may have a non-zero occupancy value associated with them. The + * rotation worker moves RMIDs from the limbo list to the free list once + * the occupancy value drops below __intel_cqm_threshold. + * + * Both lists are protected by cache_mutex. + */ +static LIST_HEAD(cqm_rmid_free_lru); +static LIST_HEAD(cqm_rmid_limbo_lru); + +/* + * We use a simple array of pointers so that we can lookup a struct + * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() + * and __put_rmid() from having to worry about dealing with struct + * cqm_rmid_entry - they just deal with rmids, i.e. integers. + * + * Once this array is initialized it is read-only. No locks are required + * to access it. + * + * All entries for all RMIDs can be looked up in the this array at all + * times. + */ +static struct cqm_rmid_entry **cqm_rmid_ptrs; + +static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) +{ + struct cqm_rmid_entry *entry; + + entry = cqm_rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +/* + * Returns < 0 on fail. + * + * We expect to be called with cache_mutex held. + */ +static u32 __get_rmid(void) +{ + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + if (list_empty(&cqm_rmid_free_lru)) + return INVALID_RMID; + + entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void __put_rmid(u32 rmid) +{ + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + WARN_ON(!__rmid_valid(rmid)); + entry = __rmid_entry(rmid); + + entry->queue_time = jiffies; + entry->state = RMID_YOUNG; + + list_add_tail(&entry->list, &cqm_rmid_limbo_lru); +} + +static int intel_cqm_setup_rmid_cache(void) +{ + struct cqm_rmid_entry *entry; + unsigned int nr_rmids; + int r = 0; + + nr_rmids = cqm_max_rmid + 1; + cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + nr_rmids, GFP_KERNEL); + if (!cqm_rmid_ptrs) + return -ENOMEM; + + for (; r <= cqm_max_rmid; r++) { + struct cqm_rmid_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto fail; + + INIT_LIST_HEAD(&entry->list); + entry->rmid = r; + cqm_rmid_ptrs[r] = entry; + + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + mutex_lock(&cache_mutex); + intel_cqm_rotation_rmid = __get_rmid(); + mutex_unlock(&cache_mutex); + + return 0; +fail: + while (r--) + kfree(cqm_rmid_ptrs[r]); + + kfree(cqm_rmid_ptrs); + return -ENOMEM; +} + +/* + * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. 
+ */ +static bool __match_event(struct perf_event *a, struct perf_event *b) +{ + /* Per-cpu and task events don't mix */ + if ((a->attach_state & PERF_ATTACH_TASK) != + (b->attach_state & PERF_ATTACH_TASK)) + return false; + +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target same task are placed into the same cache group. + */ + if (a->hw.target == b->hw.target) + return true; + + /* + * Are we an inherited event? + */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.target, event->ctx); + + return event->cgrp; +} +#endif + +/* + * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. + */ +static bool __conflict_event(struct perf_event *a, struct perf_event *b) +{ +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. + */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif + /* + * If one of them is not a task, same story as above with cgroups. + */ + if (!(a->attach_state & PERF_ATTACH_TASK) || + !(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Must be non-overlapping. + */ + return false; +} + +struct rmid_read { + u32 rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info); + +/* + * Exchange the RMID of a group of events. + */ +static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) +{ + struct perf_event *event; + struct list_head *head = &group->hw.cqm_group_entry; + u32 old_rmid = group->hw.cqm_rmid; + + lockdep_assert_held(&cache_mutex); + + /* + * If our RMID is being deallocated, perform a read now. 
+ */ + if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + .rmid = old_rmid, + }; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, + &rr, 1); + local64_set(&group->count, atomic64_read(&rr.value)); + } + + raw_spin_lock_irq(&cache_lock); + + group->hw.cqm_rmid = rmid; + list_for_each_entry(event, head, hw.cqm_group_entry) + event->hw.cqm_rmid = rmid; + + raw_spin_unlock_irq(&cache_lock); + + return old_rmid; +} + +/* + * If we fail to assign a new RMID for intel_cqm_rotation_rmid because + * cachelines are still tagged with RMIDs in limbo, we progressively + * increment the threshold until we find an RMID in limbo with <= + * __intel_cqm_threshold lines tagged. This is designed to mitigate the + * problem where cachelines tagged with an RMID are not steadily being + * evicted. + * + * On successful rotations we decrease the threshold back towards zero. + * + * __intel_cqm_max_threshold provides an upper bound on the threshold, + * and is measured in bytes because it's exposed to userland. + */ +static unsigned int __intel_cqm_threshold; +static unsigned int __intel_cqm_max_threshold; + +/* + * Test whether an RMID has a zero occupancy value on this cpu. + */ +static void intel_cqm_stable(void *arg) +{ + struct cqm_rmid_entry *entry; + + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + if (entry->state != RMID_AVAILABLE) + break; + + if (__rmid_read(entry->rmid) > __intel_cqm_threshold) + entry->state = RMID_DIRTY; + } +} + +/* + * If we have group events waiting for an RMID that don't conflict with + * events already running, assign @rmid. + */ +static bool intel_cqm_sched_in_event(u32 rmid) +{ + struct perf_event *leader, *event; + + lockdep_assert_held(&cache_mutex); + + leader = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + event = leader; + + list_for_each_entry_continue(event, &cache_groups, + hw.cqm_groups_entry) { + if (__rmid_valid(event->hw.cqm_rmid)) + continue; + + if (__conflict_event(event, leader)) + continue; + + intel_cqm_xchg_rmid(event, rmid); + return true; + } + + return false; +} + +/* + * Initially use this constant for both the limbo queue time and the + * rotation timer interval, pmu::hrtimer_interval_ms. + * + * They don't need to be the same, but the two are related since if you + * rotate faster than you recycle RMIDs, you may run out of available + * RMIDs. + */ +#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ + +static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; + +/* + * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list + * @nr_available: number of freeable RMIDs on the limbo list + * + * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no + * cachelines are tagged with those RMIDs. After this we can reuse them + * and know that the current set of active RMIDs is stable. + * + * Return %true or %false depending on whether stabilization needs to be + * reattempted. + * + * If we return %true then @nr_available is updated to indicate the + * number of RMIDs on the limbo list that have been queued for the + * minimum queue time (RMID_AVAILABLE), but whose data occupancy values + * are above __intel_cqm_threshold. 
+ */ +static bool intel_cqm_rmid_stabilize(unsigned int *available) +{ + struct cqm_rmid_entry *entry, *tmp; + + lockdep_assert_held(&cache_mutex); + + *available = 0; + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + unsigned long min_queue_time; + unsigned long now = jiffies; + + /* + * We hold RMIDs placed into limbo for a minimum queue + * time. Before the minimum queue time has elapsed we do + * not recycle RMIDs. + * + * The reasoning is that until a sufficient time has + * passed since we stopped using an RMID, any RMID + * placed onto the limbo list will likely still have + * data tagged in the cache, which means we'll probably + * fail to recycle it anyway. + * + * We can save ourselves an expensive IPI by skipping + * any RMIDs that have not been queued for the minimum + * time. + */ + min_queue_time = entry->queue_time + + msecs_to_jiffies(__rmid_queue_time_ms); + + if (time_after(min_queue_time, now)) + break; + + entry->state = RMID_AVAILABLE; + (*available)++; + } + + /* + * Fast return if none of the RMIDs on the limbo list have been + * sitting on the queue for the minimum queue time. + */ + if (!*available) + return false; + + /* + * Test whether an RMID is free for each package. + */ + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); + + list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { + /* + * Exhausted all RMIDs that have waited min queue time. + */ + if (entry->state == RMID_YOUNG) + break; + + if (entry->state == RMID_DIRTY) + continue; + + list_del(&entry->list); /* remove from limbo */ + + /* + * The rotation RMID gets priority if it's + * currently invalid. In which case, skip adding + * the RMID to the the free lru. + */ + if (!__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_rotation_rmid = entry->rmid; + continue; + } + + /* + * If we have groups waiting for RMIDs, hand + * them one now provided they don't conflict. + */ + if (intel_cqm_sched_in_event(entry->rmid)) + continue; + + /* + * Otherwise place it onto the free list. + */ + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + + return __rmid_valid(intel_cqm_rotation_rmid); +} + +/* + * Pick a victim group and move it to the tail of the group list. + * @next: The first group without an RMID + */ +static void __intel_cqm_pick_and_rotate(struct perf_event *next) +{ + struct perf_event *rotor; + u32 rmid; + + lockdep_assert_held(&cache_mutex); + + rotor = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + + /* + * The group at the front of the list should always have a valid + * RMID. If it doesn't then no groups have RMIDs assigned and we + * don't need to rotate the list. + */ + if (next == rotor) + return; + + rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); + __put_rmid(rmid); + + list_rotate_left(&cache_groups); +} + +/* + * Deallocate the RMIDs from any events that conflict with @event, and + * place them on the back of the group list. + */ +static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) +{ + struct perf_event *group, *g; + u32 rmid; + + lockdep_assert_held(&cache_mutex); + + list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { + if (group == event) + continue; + + rmid = group->hw.cqm_rmid; + + /* + * Skip events that don't have a valid RMID. + */ + if (!__rmid_valid(rmid)) + continue; + + /* + * No conflict? No problem! Leave the event alone. 
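The minimum-queue-time check in intel_cqm_rmid_stabilize() above works because the limbo list is in queue order, so the scan can stop at the first entry that has not waited long enough. A userspace analogue of that check (monotonic milliseconds standing in for jiffies, not part of the patch):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define RMID_QUEUE_TIME_MS	250	/* mirrors RMID_DEFAULT_QUEUE_TIME */

static uint64_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/* Has this limbo entry waited long enough to be considered for recycling? */
static bool queue_time_elapsed(uint64_t queued_at_ms)
{
	return now_ms() >= queued_at_ms + RMID_QUEUE_TIME_MS;
}

int main(void)
{
	uint64_t queued_at = now_ms();

	printf("fresh entry recyclable? %s\n",
	       queue_time_elapsed(queued_at) ? "yes" : "no");
	return 0;
}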
+ */ + if (!__conflict_event(group, event)) + continue; + + intel_cqm_xchg_rmid(group, INVALID_RMID); + __put_rmid(rmid); + } +} + +/* + * Attempt to rotate the groups and assign new RMIDs. + * + * We rotate for two reasons, + * 1. To handle the scheduling of conflicting events + * 2. To recycle RMIDs + * + * Rotating RMIDs is complicated because the hardware doesn't give us + * any clues. + * + * There's problems with the hardware interface; when you change the + * task:RMID map cachelines retain their 'old' tags, giving a skewed + * picture. In order to work around this, we must always keep one free + * RMID - intel_cqm_rotation_rmid. + * + * Rotation works by taking away an RMID from a group (the old RMID), + * and assigning the free RMID to another group (the new RMID). We must + * then wait for the old RMID to not be used (no cachelines tagged). + * This ensure that all cachelines are tagged with 'active' RMIDs. At + * this point we can start reading values for the new RMID and treat the + * old RMID as the free RMID for the next rotation. + * + * Return %true or %false depending on whether we did any rotating. + */ +static bool __intel_cqm_rmid_rotate(void) +{ + struct perf_event *group, *start = NULL; + unsigned int threshold_limit; + unsigned int nr_needed = 0; + unsigned int nr_available; + bool rotated = false; + + mutex_lock(&cache_mutex); + +again: + /* + * Fast path through this function if there are no groups and no + * RMIDs that need cleaning. + */ + if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { + if (!__rmid_valid(group->hw.cqm_rmid)) { + if (!start) + start = group; + nr_needed++; + } + } + + /* + * We have some event groups, but they all have RMIDs assigned + * and no RMIDs need cleaning. + */ + if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + if (!nr_needed) + goto stabilize; + + /* + * We have more event groups without RMIDs than available RMIDs, + * or we have event groups that conflict with the ones currently + * scheduled. + * + * We force deallocate the rmid of the group at the head of + * cache_groups. The first event group without an RMID then gets + * assigned intel_cqm_rotation_rmid. This ensures we always make + * forward progress. + * + * Rotate the cache_groups list so the previous head is now the + * tail. + */ + __intel_cqm_pick_and_rotate(start); + + /* + * If the rotation is going to succeed, reduce the threshold so + * that we don't needlessly reuse dirty RMIDs. + */ + if (__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); + intel_cqm_rotation_rmid = __get_rmid(); + + intel_cqm_sched_out_conflicting_events(start); + + if (__intel_cqm_threshold) + __intel_cqm_threshold--; + } + + rotated = true; + +stabilize: + /* + * We now need to stablize the RMID we freed above (if any) to + * ensure that the next time we rotate we have an RMID with zero + * occupancy value. + * + * Alternatively, if we didn't need to perform any rotation, + * we'll have a bunch of RMIDs in limbo that need stabilizing. + */ + threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; + + while (intel_cqm_rmid_stabilize(&nr_available) && + __intel_cqm_threshold < threshold_limit) { + unsigned int steal_limit; + + /* + * Don't spin if nobody is actively waiting for an RMID, + * the rotation worker will be kicked as soon as an + * event needs an RMID anyway. 
+ */ + if (!nr_needed) + break; + + /* Allow max 25% of RMIDs to be in limbo. */ + steal_limit = (cqm_max_rmid + 1) / 4; + + /* + * We failed to stabilize any RMIDs so our rotation + * logic is now stuck. In order to make forward progress + * we have a few options: + * + * 1. rotate ("steal") another RMID + * 2. increase the threshold + * 3. do nothing + * + * We do both of 1. and 2. until we hit the steal limit. + * + * The steal limit prevents all RMIDs ending up on the + * limbo list. This can happen if every RMID has a + * non-zero occupancy above threshold_limit, and the + * occupancy values aren't dropping fast enough. + * + * Note that there is prioritisation at work here - we'd + * rather increase the number of RMIDs on the limbo list + * than increase the threshold, because increasing the + * threshold skews the event data (because we reuse + * dirty RMIDs) - threshold bumps are a last resort. + */ + if (nr_available < steal_limit) + goto again; + + __intel_cqm_threshold++; + } + +out: + mutex_unlock(&cache_mutex); + return rotated; +} + +static void intel_cqm_rmid_rotate(struct work_struct *work); + +static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); + +static struct pmu intel_cqm_pmu; + +static void intel_cqm_rmid_rotate(struct work_struct *work) +{ + unsigned long delay; + + __intel_cqm_rmid_rotate(); + + delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); + schedule_delayed_work(&intel_cqm_rmid_work, delay); +} + +/* + * Find a group and setup RMID. + * + * If we're part of a group, we use the group's RMID. + */ +static void intel_cqm_setup_event(struct perf_event *event, + struct perf_event **group) +{ + struct perf_event *iter; + bool conflict = false; + u32 rmid; + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + rmid = iter->hw.cqm_rmid; + + if (__match_event(iter, event)) { + /* All tasks in a group share an RMID */ + event->hw.cqm_rmid = rmid; + *group = iter; + return; + } + + /* + * We only care about conflicts for events that are + * actually scheduled in (and hence have a valid RMID). + */ + if (__conflict_event(iter, event) && __rmid_valid(rmid)) + conflict = true; + } + + if (conflict) + rmid = INVALID_RMID; + else + rmid = __get_rmid(); + + event->hw.cqm_rmid = rmid; +} + +static void intel_cqm_event_read(struct perf_event *event) +{ + unsigned long flags; + u32 rmid; + u64 val; + + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + raw_spin_lock_irqsave(&cache_lock, flags); + rmid = event->hw.cqm_rmid; + + if (!__rmid_valid(rmid)) + goto out; + + val = __rmid_read(rmid); + + /* + * Ignore this reading on error states and do not update the value. + */ + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + goto out; + + local64_set(&event->count, val); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); +} + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + unsigned long flags; + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). 
+ */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + /* + * Getting up-to-date values requires an SMP IPI which is not + * possible if we're being called in interrupt context. Return + * the cached values instead. + */ + if (unlikely(in_interrupt())) + goto out; + + /* + * Notice that we don't perform the reading of an RMID + * atomically, because we can't hold a spin lock across the + * IPIs. + * + * Speculatively perform the read, since @event might be + * assigned a different (possibly invalid) RMID while we're + * busying performing the IPI calls. It's therefore necessary to + * check @event's RMID afterwards, and if it has changed, + * discard the result of the read. + */ + rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); + + if (!__rmid_valid(rr.rmid)) + goto out; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + raw_spin_lock_irqsave(&cache_lock, flags); + if (event->hw.cqm_rmid == rr.rmid) + local64_set(&event->count, atomic64_read(&rr.value)); + raw_spin_unlock_irqrestore(&cache_lock, flags); +out: + return __perf_event_count(event); +} + +static void intel_cqm_event_start(struct perf_event *event, int mode) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 rmid = event->hw.cqm_rmid; + + if (!(event->hw.cqm_state & PERF_HES_STOPPED)) + return; + + event->hw.cqm_state &= ~PERF_HES_STOPPED; + + if (state->rmid_usecnt++) { + if (!WARN_ON_ONCE(state->rmid != rmid)) + return; + } else { + WARN_ON_ONCE(state->rmid); + } + + state->rmid = rmid; + wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); +} + +static void intel_cqm_event_stop(struct perf_event *event, int mode) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + if (event->hw.cqm_state & PERF_HES_STOPPED) + return; + + event->hw.cqm_state |= PERF_HES_STOPPED; + + intel_cqm_event_read(event); + + if (!--state->rmid_usecnt) { + state->rmid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); + } else { + WARN_ON_ONCE(!state->rmid); + } +} + +static int intel_cqm_event_add(struct perf_event *event, int mode) +{ + unsigned long flags; + u32 rmid; + + raw_spin_lock_irqsave(&cache_lock, flags); + + event->hw.cqm_state = PERF_HES_STOPPED; + rmid = event->hw.cqm_rmid; + + if (__rmid_valid(rmid) && (mode & PERF_EF_START)) + intel_cqm_event_start(event, mode); + + raw_spin_unlock_irqrestore(&cache_lock, flags); + + return 0; +} + +static void intel_cqm_event_destroy(struct perf_event *event) +{ + struct perf_event *group_other = NULL; + + mutex_lock(&cache_mutex); + + /* + * If there's another event in this group... + */ + if (!list_empty(&event->hw.cqm_group_entry)) { + group_other = list_first_entry(&event->hw.cqm_group_entry, + struct perf_event, + hw.cqm_group_entry); + list_del(&event->hw.cqm_group_entry); + } + + /* + * And we're the group leader.. + */ + if (cqm_group_leader(event)) { + /* + * If there was a group_other, make that leader, otherwise + * destroy the group and return the RMID. 
+ */ + if (group_other) { + list_replace(&event->hw.cqm_groups_entry, + &group_other->hw.cqm_groups_entry); + } else { + u32 rmid = event->hw.cqm_rmid; + + if (__rmid_valid(rmid)) + __put_rmid(rmid); + list_del(&event->hw.cqm_groups_entry); + } + } + + mutex_unlock(&cache_mutex); +} + +static int intel_cqm_event_init(struct perf_event *event) +{ + struct perf_event *group = NULL; + bool rotate = false; + + if (event->attr.type != intel_cqm_pmu.type) + return -ENOENT; + + if (event->attr.config & ~QOS_EVENT_MASK) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + INIT_LIST_HEAD(&event->hw.cqm_group_entry); + INIT_LIST_HEAD(&event->hw.cqm_groups_entry); + + event->destroy = intel_cqm_event_destroy; + + mutex_lock(&cache_mutex); + + /* Will also set rmid */ + intel_cqm_setup_event(event, &group); + + if (group) { + list_add_tail(&event->hw.cqm_group_entry, + &group->hw.cqm_group_entry); + } else { + list_add_tail(&event->hw.cqm_groups_entry, + &cache_groups); + + /* + * All RMIDs are either in use or have recently been + * used. Kick the rotation worker to clean/free some. + * + * We only do this for the group leader, rather than for + * every event in a group to save on needless work. + */ + if (!__rmid_valid(event->hw.cqm_rmid)) + rotate = true; + } + + mutex_unlock(&cache_mutex); + + if (rotate) + schedule_delayed_work(&intel_cqm_rmid_work, 0); + + return 0; +} + +EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); +EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); +EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); +EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); +EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); + +static struct attribute *intel_cqm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + +static struct attribute_group intel_cqm_events_group = { + .name = "events", + .attrs = intel_cqm_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *intel_cqm_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group intel_cqm_format_group = { + .name = "format", + .attrs = intel_cqm_formats_attr, +}; + +static ssize_t +max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + ssize_t rv; + + mutex_lock(&cache_mutex); + rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); + mutex_unlock(&cache_mutex); + + return rv; +} + +static ssize_t +max_recycle_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int bytes, cachelines; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + mutex_lock(&cache_mutex); + + __intel_cqm_max_threshold = bytes; + cachelines = bytes / cqm_l3_scale; + + /* + * The new maximum takes effect immediately. 
+ */ + if (__intel_cqm_threshold > cachelines) + __intel_cqm_threshold = cachelines; + + mutex_unlock(&cache_mutex); + + return count; +} + +static DEVICE_ATTR_RW(max_recycle_threshold); + +static struct attribute *intel_cqm_attrs[] = { + &dev_attr_max_recycle_threshold.attr, + NULL, +}; + +static const struct attribute_group intel_cqm_group = { + .attrs = intel_cqm_attrs, +}; + +static const struct attribute_group *intel_cqm_attr_groups[] = { + &intel_cqm_events_group, + &intel_cqm_format_group, + &intel_cqm_group, + NULL, +}; + +static struct pmu intel_cqm_pmu = { + .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, + .attr_groups = intel_cqm_attr_groups, + .task_ctx_nr = perf_sw_context, + .event_init = intel_cqm_event_init, + .add = intel_cqm_event_add, + .del = intel_cqm_event_stop, + .start = intel_cqm_event_start, + .stop = intel_cqm_event_stop, + .read = intel_cqm_event_read, + .count = intel_cqm_event_count, +}; + +static inline void cqm_pick_event_reader(int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + for_each_cpu(i, &cqm_cpumask) { + if (phys_id == topology_physical_package_id(i)) + return; /* already got reader for this socket */ + } + + cpumask_set_cpu(cpu, &cqm_cpumask); +} + +static void intel_cqm_cpu_starting(unsigned int cpu) +{ + struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + state->rmid = 0; + state->closid = 0; + state->rmid_usecnt = 0; + + WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); + WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); +} + +static void intel_cqm_cpu_exit(unsigned int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + /* + * Is @cpu a designated cqm reader? + */ + if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) + return; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + + if (phys_id == topology_physical_package_id(i)) { + cpumask_set_cpu(i, &cqm_cpumask); + break; + } + } +} + +static int intel_cqm_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + intel_cqm_cpu_exit(cpu); + break; + case CPU_STARTING: + intel_cqm_cpu_starting(cpu); + cqm_pick_event_reader(cpu); + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id intel_cqm_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, + {} +}; + +static int __init intel_cqm_init(void) +{ + char *str, scale[20]; + int i, cpu, ret; + + if (!x86_match_cpu(intel_cqm_match)) + return -ENODEV; + + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; + + /* + * It's possible that not all resources support the same number + * of RMIDs. Instead of making scheduling much more complicated + * (where we have to match a task's RMID to a cpu that supports + * that many RMIDs) just find the minimum RMIDs supported across + * all cpus. + * + * Also, check that the scales match on all cpus. + */ + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_cache_max_rmid < cqm_max_rmid) + cqm_max_rmid = c->x86_cache_max_rmid; + + if (c->x86_cache_occ_scale != cqm_l3_scale) { + pr_err("Multiple LLC scale values, disabling\n"); + ret = -EINVAL; + goto out; + } + } + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. 
+ * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + __intel_cqm_max_threshold = + boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); + + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); + str = kstrdup(scale, GFP_KERNEL); + if (!str) { + ret = -ENOMEM; + goto out; + } + + event_attr_intel_cqm_llc_scale.event_str = str; + + ret = intel_cqm_setup_rmid_cache(); + if (ret) + goto out; + + for_each_online_cpu(i) { + intel_cqm_cpu_starting(i); + cqm_pick_event_reader(i); + } + + __perf_cpu_notifier(intel_cqm_cpu_notifier); + + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); + if (ret) + pr_err("Intel CQM perf registration failed: %d\n", ret); + else + pr_info("Intel CQM monitoring enabled\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +device_initcall(intel_cqm_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 0a27243cb6c3..64781cfcbb2b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c deleted file mode 100644 index a316ca96f1b6..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ /dev/null @@ -1,1391 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include -#include -#include -#include "perf_event.h" - -#define MSR_IA32_PQR_ASSOC 0x0c8f -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); - -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. 
- */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -#define QOS_L3_OCCUP_EVENT_ID (1 << 0) - -#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. 
- */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - - kfree(cqm_rmid_ptrs); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - */ - if (a->hw.target == b->hw.target) - return true; - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. 
- */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .value = ATOMIC64_INIT(0), - .rmid = old_rmid, - }; - - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. 
- */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. 
- */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. 
- */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. 
- */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values. This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event)) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. 
- */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - - mutex_lock(&cache_mutex); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. 
- */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if (event->attr.config & ~QOS_EVENT_MASK) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. - */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = intel_cqm_events_attr, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. 
- */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int phys_id = topology_physical_package_id(cpu); - int i; - - for_each_cpu(i, &cqm_cpumask) { - if (phys_id == topology_physical_package_id(i)) - return; /* already got reader for this socket */ - } - - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static void intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); -} - -static void intel_cqm_cpu_exit(unsigned int cpu) -{ - int phys_id = topology_physical_package_id(cpu); - int i; - - /* - * Is @cpu a designated cqm reader? - */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - - if (phys_id == topology_physical_package_id(i)) { - cpumask_set_cpu(i, &cqm_cpumask); - break; - } - } -} - -static int intel_cqm_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - intel_cqm_cpu_exit(cpu); - break; - case CPU_STARTING: - intel_cqm_cpu_starting(cpu); - cqm_pick_event_reader(cpu); - break; - } - - return NOTIFY_OK; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static int __init intel_cqm_init(void) -{ - char *str, scale[20]; - int i, cpu, ret; - - if (!x86_match_cpu(intel_cqm_match)) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. 
- * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - for_each_online_cpu(i) { - intel_cqm_cpu_starting(i); - cqm_pick_event_reader(i); - } - - __perf_cpu_notifier(intel_cqm_cpu_notifier); - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) - pr_err("Intel CQM perf registration failed: %d\n", ret); - else - pr_info("Intel CQM monitoring enabled\n"); - -out: - cpu_notifier_register_done(); - - return ret; -} -device_initcall(intel_cqm_init); -- cgit From 6aec1ad7365661315e9ab13b17eeb97ab3c38176 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:10 +0100 Subject: perf/x86: Move perf_event_intel_cstate.c ..... => x86/events/intel/cstate.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 1 + arch/x86/events/intel/cstate.c | 694 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/perf_event_intel_cstate.c | 694 -------------------------- 4 files changed, 695 insertions(+), 695 deletions(-) create mode 100644 arch/x86/events/intel/cstate.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_cstate.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f97c283ec7a2..e0a70fea7332 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,3 +6,4 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c new file mode 100644 index 000000000000..1bbf37e8604b --- /dev/null +++ b/arch/x86/events/intel/cstate.c @@ -0,0 +1,694 @@ +/* + * perf_event_intel_cstate.c: support cstate residency counters + * + * Copyright (C) 2015, Intel Corp. + * Author: Kan Liang (kan.liang@intel.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + */ + +/* + * This file export cstate related free running (read-only) counters + * for perf. These counters may be use simultaneously by other tools, + * such as turbostat. However, it still make sense to implement them + * in perf. Because we can conveniently collect them together with + * other events, and allow to use them from tools without special MSR + * access code. + * + * The events only support system-wide mode counting. There is no + * sampling support because it is not supported by the hardware. 
+ * + * According to counters' scope and category, two PMUs are registered + * with the perf_event core subsystem. + * - 'cstate_core': The counter is available for each physical core. + * The counters include CORE_C*_RESIDENCY. + * - 'cstate_pkg': The counter is available for each physical package. + * The counters include PKG_C*_RESIDENCY. + * + * All of these counters are specified in the Intel® 64 and IA-32 + * Architectures Software Developer.s Manual Vol3b. + * + * Model specific counters: + * MSR_CORE_C1_RES: CORE C1 Residency Counter + * perf code: 0x00 + * Available model: SLM,AMT + * Scope: Core (each processor core has a MSR) + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter + * perf code: 0x03 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. + * perf code: 0x00 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. + * perf code: 0x03 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. + * perf code: 0x04 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. + * perf code: 0x05 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
+ * perf code: 0x06 + * Available model: HSW ULT only + * Scope: Package (physical package) + * + */ + +#include +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf); + +struct perf_cstate_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + + +/* cstate_core PMU */ + +static struct pmu cstate_core_pmu; +static bool has_cstate_core; + +enum perf_cstate_core_id { + /* + * cstate_core events + */ + PERF_CSTATE_CORE_C1_RES = 0, + PERF_CSTATE_CORE_C3_RES, + PERF_CSTATE_CORE_C6_RES, + PERF_CSTATE_CORE_C7_RES, + + PERF_CSTATE_CORE_EVENT_MAX, +}; + +bool test_core(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C1_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); + +static struct perf_cstate_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, +}; + +static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group core_events_attr_group = { + .name = "events", + .attrs = core_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); +static struct attribute *core_format_attrs[] = { + 
&format_attr_core_event.attr, + NULL, +}; + +static struct attribute_group core_format_attr_group = { + .name = "format", + .attrs = core_format_attrs, +}; + +static cpumask_t cstate_core_cpu_mask; +static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); + +static struct attribute *cstate_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cstate_cpumask_attrs, +}; + +static const struct attribute_group *core_attr_groups[] = { + &core_events_attr_group, + &core_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_core PMU end */ + + +/* cstate_pkg PMU */ + +static struct pmu cstate_pkg_pmu; +static bool has_cstate_pkg; + +enum perf_cstate_pkg_id { + /* + * cstate_pkg events + */ + PERF_CSTATE_PKG_C2_RES = 0, + PERF_CSTATE_PKG_C3_RES, + PERF_CSTATE_PKG_C6_RES, + PERF_CSTATE_PKG_C7_RES, + PERF_CSTATE_PKG_C8_RES, + PERF_CSTATE_PKG_C9_RES, + PERF_CSTATE_PKG_C10_RES, + + PERF_CSTATE_PKG_EVENT_MAX, +}; + +bool test_pkg(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 69: /* 22nm Haswell ULT */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES || + idx == PERF_CSTATE_PKG_C8_RES || + idx == PERF_CSTATE_PKG_C9_RES || + idx == PERF_CSTATE_PKG_C10_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); + +static struct perf_cstate_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, 
&evattr_cstate_pkg_c6, test_pkg, }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, +}; + +static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group pkg_events_attr_group = { + .name = "events", + .attrs = pkg_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); +static struct attribute *pkg_format_attrs[] = { + &format_attr_pkg_event.attr, + NULL, +}; +static struct attribute_group pkg_format_attr_group = { + .name = "format", + .attrs = pkg_format_attrs, +}; + +static cpumask_t cstate_pkg_cpu_mask; + +static const struct attribute_group *pkg_attr_groups[] = { + &pkg_events_attr_group, + &pkg_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_pkg PMU end*/ + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu == &cstate_core_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); + else if (pmu == &cstate_pkg_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else + return 0; +} + +static int cstate_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + int ret = 0; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (event->pmu == &cstate_core_pmu) { + if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) + return -EINVAL; + if (!core_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + if (!pkg_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = pkg_msr[cfg].msr; + } else + return -ENOENT; + + /* must be done before validate_group */ + event->hw.config = cfg; + event->hw.idx = -1; + + return ret; +} + +static inline u64 cstate_pmu_read_counter(struct perf_event *event) +{ + u64 val; + + rdmsrl(event->hw.event_base, val); + return val; +} + +static void cstate_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = cstate_pmu_read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + local64_add(new_raw_count - prev_raw_count, &event->count); +} + +static void cstate_pmu_event_start(struct perf_event *event, int mode) +{ + local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); +} + +static void cstate_pmu_event_stop(struct perf_event *event, int mode) +{ + cstate_pmu_event_update(event); +} + +static void cstate_pmu_event_del(struct perf_event *event, int mode) +{ + cstate_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int cstate_pmu_event_add(struct perf_event *event, int mode) +{ + if (mode & PERF_EF_START) + cstate_pmu_event_start(event, mode); + 
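cstate_pmu_event_update() above uses the usual idiom for free-running hardware counters: snapshot the counter, publish the snapshot with a compare-and-exchange so a racing reader cannot double-count, then accumulate only the delta. A small stand-alone sketch of that idiom, with C11 atomics standing in for local64_t and a fake counter standing in for the rdmsrl() read (all names here are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t prev_count;	/* stands in for hwc->prev_count */
static _Atomic uint64_t event_count;	/* stands in for event->count */

/* Fake free-running counter; the driver reads an MSR here instead. */
static uint64_t read_counter(void)
{
	static uint64_t fake;

	return fake += 12345;
}

static void event_update(void)
{
	uint64_t prev, now;

	do {
		prev = atomic_load(&prev_count);
		now = read_counter();
		/* Publish the new snapshot only if nobody raced with us. */
	} while (!atomic_compare_exchange_weak(&prev_count, &prev, now));

	/* Accumulate only the delta since the previous snapshot. */
	atomic_fetch_add(&event_count, now - prev);
}

int main(void)
{
	event_update();
	event_update();
	printf("accumulated: %llu\n",
	       (unsigned long long)atomic_load(&event_count));
	return 0;
}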
+ return 0; +} + +static void cstate_cpu_exit(int cpu) +{ + int i, id, target; + + /* cpu exit for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_core_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_core_cpu_mask); + WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } + + /* cpu exit for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_physical_package_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_pkg_cpu_mask); + WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } +} + +static void cstate_cpu_init(int cpu) +{ + int i, id; + + /* cpu init for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + for_each_cpu(i, &cstate_core_cpu_mask) { + if (id == topology_core_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + } + + /* cpu init for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + for_each_cpu(i, &cstate_pkg_cpu_mask) { + if (id == topology_physical_package_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + } +} + +static int cstate_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + break; + case CPU_STARTING: + cstate_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + break; + case CPU_ONLINE: + case CPU_DEAD: + break; + case CPU_DOWN_PREPARE: + cstate_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there is no available events. + */ +static bool cstate_probe_msr(struct perf_cstate_msr *msr, + struct attribute **events_attrs, + int max_event_nr) +{ + int i, j = 0; + u64 val; + + /* Probe the cstate events. */ + for (i = 0; i < max_event_nr; i++) { + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining events in the sysfs attrs. */ + for (i = 0; i < max_event_nr; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + return (j > 0) ? true : false; +} + +static int __init cstate_init(void) +{ + /* SLM has different MSR for PKG C6 */ + switch (boot_cpu_data.x86_model) { + case 55: + case 76: + case 77: + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + } + + if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) + has_cstate_core = true; + + if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) + has_cstate_pkg = true; + + return (has_cstate_core || has_cstate_pkg) ? 
0 : -ENODEV; +} + +static void __init cstate_cpumask_init(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); + + __perf_cpu_notifier(cstate_cpu_notifier); + + cpu_notifier_register_done(); +} + +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static void __init cstate_pmus_register(void) +{ + int err; + + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_core_pmu.name, err); + } + + if (has_cstate_pkg) { + err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_pkg_pmu.name, err); + } +} + +static int __init cstate_pmu_init(void) +{ + int err; + + if (cpu_has_hypervisor) + return -ENODEV; + + err = cstate_init(); + if (err) + return err; + + cstate_cpumask_init(); + + cstate_pmus_register(); + + return 0; +} + +device_initcall(cstate_pmu_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 64781cfcbb2b..1bc881bd91ea 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c deleted file mode 100644 index 75a38b5a2e26..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_cstate.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * perf_event_intel_cstate.c: support cstate residency counters - * - * Copyright (C) 2015, Intel Corp. - * Author: Kan Liang (kan.liang@intel.com) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - */ - -/* - * This file export cstate related free running (read-only) counters - * for perf. 
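For reference, the PMUs registered above surface as dynamic perf PMUs, and the 'events'/'format' attribute groups are what lets tools resolve names such as cstate_core/c6-residency/. A rough user-space sketch of counting one of these events follows; the sysfs path assumes the standard event_source layout, the config value 0x02 is taken from the c6-residency attribute string above, and error handling is trimmed:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	unsigned int type;
	uint64_t count;
	FILE *f;
	int fd;

	/* The dynamic PMU's type id is published by the perf core. */
	f = fopen("/sys/bus/event_source/devices/cstate_core/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x02;	/* c6-residency, per the event attrs above */

	/* Counting only, system-wide on CPU 0: pid == -1, cpu == 0. */
	fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("core C6 residency ticks: %llu\n",
		       (unsigned long long)count);
	return 0;
}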
These counters may be use simultaneously by other tools, - * such as turbostat. However, it still make sense to implement them - * in perf. Because we can conveniently collect them together with - * other events, and allow to use them from tools without special MSR - * access code. - * - * The events only support system-wide mode counting. There is no - * sampling support because it is not supported by the hardware. - * - * According to counters' scope and category, two PMUs are registered - * with the perf_event core subsystem. - * - 'cstate_core': The counter is available for each physical core. - * The counters include CORE_C*_RESIDENCY. - * - 'cstate_pkg': The counter is available for each physical package. - * The counters include PKG_C*_RESIDENCY. - * - * All of these counters are specified in the Intel® 64 and IA-32 - * Architectures Software Developer.s Manual Vol3b. - * - * Model specific counters: - * MSR_CORE_C1_RES: CORE C1 Residency Counter - * perf code: 0x00 - * Available model: SLM,AMT - * Scope: Core (each processor core has a MSR) - * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter - * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter - * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter - * perf code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. - * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. - * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. - * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. - * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. - * perf code: 0x04 - * Available model: HSW ULT only - * Scope: Package (physical package) - * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. - * perf code: 0x05 - * Available model: HSW ULT only - * Scope: Package (physical package) - * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
- * perf code: 0x06 - * Available model: HSW ULT only - * Scope: Package (physical package) - * - */ - -#include -#include -#include -#include -#include "perf_event.h" - -#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __cstate_##_var##_show, NULL) - -static ssize_t cstate_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf); - -struct perf_cstate_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); -}; - - -/* cstate_core PMU */ - -static struct pmu cstate_core_pmu; -static bool has_cstate_core; - -enum perf_cstate_core_id { - /* - * cstate_core events - */ - PERF_CSTATE_CORE_C1_RES = 0, - PERF_CSTATE_CORE_C3_RES, - PERF_CSTATE_CORE_C6_RES, - PERF_CSTATE_CORE_C7_RES, - - PERF_CSTATE_CORE_EVENT_MAX, -}; - -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - -PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); - -static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, -}; - -static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group core_events_attr_group = { - .name = "events", - .attrs = core_events_attrs, -}; - -DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); -static struct attribute *core_format_attrs[] = { - 
&format_attr_core_event.attr, - NULL, -}; - -static struct attribute_group core_format_attr_group = { - .name = "format", - .attrs = core_format_attrs, -}; - -static cpumask_t cstate_core_cpu_mask; -static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); - -static struct attribute *cstate_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group cpumask_attr_group = { - .attrs = cstate_cpumask_attrs, -}; - -static const struct attribute_group *core_attr_groups[] = { - &core_events_attr_group, - &core_format_attr_group, - &cpumask_attr_group, - NULL, -}; - -/* cstate_core PMU end */ - - -/* cstate_pkg PMU */ - -static struct pmu cstate_pkg_pmu; -static bool has_cstate_pkg; - -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ - PERF_CSTATE_PKG_C2_RES = 0, - PERF_CSTATE_PKG_C3_RES, - PERF_CSTATE_PKG_C6_RES, - PERF_CSTATE_PKG_C7_RES, - PERF_CSTATE_PKG_C8_RES, - PERF_CSTATE_PKG_C9_RES, - PERF_CSTATE_PKG_C10_RES, - - PERF_CSTATE_PKG_EVENT_MAX, -}; - -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - -PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); -PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); -PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); -PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); - -static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, 
&evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, -}; - -static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group pkg_events_attr_group = { - .name = "events", - .attrs = pkg_events_attrs, -}; - -DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); -static struct attribute *pkg_format_attrs[] = { - &format_attr_pkg_event.attr, - NULL, -}; -static struct attribute_group pkg_format_attr_group = { - .name = "format", - .attrs = pkg_format_attrs, -}; - -static cpumask_t cstate_pkg_cpu_mask; - -static const struct attribute_group *pkg_attr_groups[] = { - &pkg_events_attr_group, - &pkg_format_attr_group, - &cpumask_attr_group, - NULL, -}; - -/* cstate_pkg PMU end*/ - -static ssize_t cstate_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - if (pmu == &cstate_core_pmu) - return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); - else if (pmu == &cstate_pkg_pmu) - return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); - else - return 0; -} - -static int cstate_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config; - int ret = 0; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - if (event->pmu == &cstate_core_pmu) { - if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) - return -EINVAL; - if (!core_msr[cfg].attr) - return -EINVAL; - event->hw.event_base = core_msr[cfg].msr; - } else if (event->pmu == &cstate_pkg_pmu) { - if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) - return -EINVAL; - if (!pkg_msr[cfg].attr) - return -EINVAL; - event->hw.event_base = pkg_msr[cfg].msr; - } else - return -ENOENT; - - /* must be done before validate_group */ - event->hw.config = cfg; - event->hw.idx = -1; - - return ret; -} - -static inline u64 cstate_pmu_read_counter(struct perf_event *event) -{ - u64 val; - - rdmsrl(event->hw.event_base, val); - return val; -} - -static void cstate_pmu_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = cstate_pmu_read_counter(event); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - local64_add(new_raw_count - prev_raw_count, &event->count); -} - -static void cstate_pmu_event_start(struct perf_event *event, int mode) -{ - local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); -} - -static void cstate_pmu_event_stop(struct perf_event *event, int mode) -{ - cstate_pmu_event_update(event); -} - -static void cstate_pmu_event_del(struct perf_event *event, int mode) -{ - cstate_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int cstate_pmu_event_add(struct perf_event *event, int mode) -{ - if (mode & PERF_EF_START) - cstate_pmu_event_start(event, mode); - 
- return 0; -} - -static void cstate_cpu_exit(int cpu) -{ - int i, id, target; - - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) - perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); - } - - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) - perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); - } -} - -static void cstate_cpu_init(int cpu) -{ - int i, id; - - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } - - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } -} - -static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; - case CPU_STARTING: - cstate_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; - case CPU_DOWN_PREPARE: - cstate_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. - */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) -{ - int i, j = 0; - u64 val; - - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - return (j > 0) ? true : false; -} - -static int __init cstate_init(void) -{ - /* SLM has different MSR for PKG C6 */ - switch (boot_cpu_data.x86_model) { - case 55: - case 76: - case 77: - pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; - } - - if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) - has_cstate_core = true; - - if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) - has_cstate_pkg = true; - - return (has_cstate_core || has_cstate_pkg) ? 
0 : -ENODEV; -} - -static void __init cstate_cpumask_init(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); - - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); -} - -static struct pmu cstate_core_pmu = { - .attr_groups = core_attr_groups, - .name = "cstate_core", - .task_ctx_nr = perf_invalid_context, - .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ - .start = cstate_pmu_event_start, - .stop = cstate_pmu_event_stop, - .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static struct pmu cstate_pkg_pmu = { - .attr_groups = pkg_attr_groups, - .name = "cstate_pkg", - .task_ctx_nr = perf_invalid_context, - .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ - .start = cstate_pmu_event_start, - .stop = cstate_pmu_event_stop, - .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static void __init cstate_pmus_register(void) -{ - int err; - - if (has_cstate_core) { - err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); - } - - if (has_cstate_pkg) { - err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); - } -} - -static int __init cstate_pmu_init(void) -{ - int err; - - if (cpu_has_hypervisor) - return -ENODEV; - - err = cstate_init(); - if (err) - return err; - - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; -} - -device_initcall(cstate_pmu_init); -- cgit From 7010d129137aa44a79a6a1911881e07e1cd5da60 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:11 +0100 Subject: perf/x86: Move perf_event_intel_ds.c ......... 
=> x86/events/intel/ds.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/ds.c | 1368 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 1368 ----------------------------- 4 files changed, 1370 insertions(+), 1370 deletions(-) create mode 100644 arch/x86/events/intel/ds.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_ds.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index e0a70fea7332..c4d41b66d167 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,4 +6,4 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c new file mode 100644 index 000000000000..9677207a2ab1 --- /dev/null +++ b/arch/x86/events/intel/ds.c @@ -0,0 +1,1368 @@ +#include +#include +#include + +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_FIXUP_SIZE PAGE_SIZE + +/* + * pebs_record_32 for p4 and core not supported + +struct pebs_record_32 { + u32 flags, ip; + u32 ax, bc, cx, dx; + u32 si, di, bp, sp; +}; + + */ + +union intel_x86_pebs_dse { + u64 val; + struct { + unsigned int ld_dse:4; + unsigned int ld_stlb_miss:1; + unsigned int ld_locked:1; + unsigned int ld_reserved:26; + }; + struct { + unsigned int st_l1d_hit:1; + unsigned int st_reserved1:3; + unsigned int st_stlb_miss:1; + unsigned int st_locked:1; + unsigned int st_reserved2:26; + }; +}; + + +/* + * Map PEBS Load Latency Data Source encodings to generic + * memory data source information + */ +#define P(a, b) PERF_MEM_S(a, b) +#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) + +static const u64 pebs_data_source[] = { + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +static u64 precise_store_data(u64 status) +{ + union intel_x86_pebs_dse 
dse; + u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); + + dse.val = status; + + /* + * bit 4: TLB access + * 1 = stored missed 2nd level TLB + * + * so it either hit the walker or the OS + * otherwise hit 2nd level TLB + */ + if (dse.st_stlb_miss) + val |= P(TLB, MISS); + else + val |= P(TLB, HIT); + + /* + * bit 0: hit L1 data cache + * if not set, then all we know is that + * it missed L1D + */ + if (dse.st_l1d_hit) + val |= P(LVL, HIT); + else + val |= P(LVL, MISS); + + /* + * bit 5: Locked prefix + */ + if (dse.st_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +static u64 precise_datala_hsw(struct perf_event *event, u64 status) +{ + union perf_mem_data_src dse; + + dse.val = PERF_MEM_NA; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + dse.mem_op = PERF_MEM_OP_STORE; + else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) + dse.mem_op = PERF_MEM_OP_LOAD; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) { + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + } + return dse.val; +} + +static u64 load_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + int model = boot_cpu_data.x86_model; + int fam = boot_cpu_data.x86; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.ld_dse]; + + /* + * Nehalem models do not support TLB, Lock infos + */ + if (fam == 0x6 && (model == 26 || model == 30 + || model == 31 || model == 46)) { + val |= P(TLB, NA) | P(LOCK, NA); + return val; + } + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.ld_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.ld_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +struct pebs_record_core { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; +}; + +struct pebs_record_nhm { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; +}; + +/* + * Same as pebs_record_nhm, with two additional fields. 
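precise_store_data() and load_latency_data() above are mostly bit-picking on the PEBS 'dse' status word through the union layout. A stand-alone illustration of the load half of that decoding; the union mirrors the bitfields defined earlier in this file and the raw value 0x23 is just an example:

#include <stdint.h>
#include <stdio.h>

/* Same layout as the load half of intel_x86_pebs_dse above. */
union pebs_dse {
	uint64_t val;
	struct {
		unsigned int ld_dse:4;		/* data-source table index */
		unsigned int ld_stlb_miss:1;	/* missed the 2nd-level TLB */
		unsigned int ld_locked:1;	/* locked access */
		unsigned int ld_reserved:26;
	};
};

int main(void)
{
	union pebs_dse dse = { .val = 0x23 };	/* example raw status value */

	printf("source index : %u\n", dse.ld_dse);	/* 0x3: L2 hit */
	printf("STLB miss    : %u\n", dse.ld_stlb_miss);
	printf("locked       : %u\n", dse.ld_locked);
	return 0;
}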
+ */ +struct pebs_record_hsw { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { + struct { + u32 cycles_last_block : 32, + hle_abort : 1, + rtm_abort : 1, + instruction_abort : 1, + non_instruction_abort : 1, + retry : 1, + data_conflict : 1, + capacity_writes : 1, + capacity_reads : 1; + }; + u64 value; +}; + +#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL + +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; + u64 tsc; +}; + +void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static DEFINE_PER_CPU(void *, insn_buffer); + +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max; + void *buffer, *ibuffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + if (unlikely(!buffer)) + return -ENOMEM; + + /* + * HSW+ already provides us the eventing ip; no need to allocate this + * buffer then. + */ + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { + kfree(buffer); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; + } + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); + return -ENOMEM; + } + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + 
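The sizes in alloc_bts_buffer() above are plain arithmetic on the constants at the top of the file; a quick stand-alone calculation (assuming 4 KiB pages) shows where the interrupt threshold lands and why a later comment in this file talks about draining at most roughly 2560 records per batch:

#include <stdio.h>

#define PAGE_SIZE	4096UL			/* assumed x86 page size */
#define BTS_RECORD_SIZE	24UL
#define BTS_BUFFER_SIZE	(PAGE_SIZE << 4)	/* 64 KiB, as in this file */

int main(void)
{
	unsigned long max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
	unsigned long thresh = max / 16;

	printf("records per buffer    : %lu\n", max);		/* 2730 */
	printf("interrupt threshold at: %lu records\n", max - thresh); /* 2560 */
	return 0;
}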
return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + +void release_ds_buffers(void) +{ + int cpu; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); + } + put_online_cpus(); +} + +void reserve_ds_buffers(void) +{ + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } + + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; + + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) + break; + } + + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } + + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); + } + + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); +} + +/* + * BTS + */ + +struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); + +void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= DEBUGCTLMSR_TR; + debugctlmsr |= DEBUGCTLMSR_BTS; + if (config & ARCH_PERFMON_EVENTSEL_INT) + debugctlmsr |= DEBUGCTLMSR_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | + DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +int intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *base, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + unsigned long skip = 0; + struct pt_regs regs; + + if (!event) + return 0; + + if (!x86_pmu.bts_active) + return 0; + + base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= base) + return 0; + + memset(®s, 0, sizeof(regs)); + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* + * BTS leaks kernel addresses in branches across the cpl boundary, + * such as traps or system calls, so unless the user is asking for + * kernel tracing (and right now it's not possible), we'd need to + * filter them out. 
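intel_pmu_drain_bts_buffer() reserves its perf output space up front, so it must count beforehand how many records the exclude_kernel filter will drop. Reduced to plain C over an in-memory array, with kernel_ip() replaced by a trivial address check for illustration, the shape of that two-pass filter is:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct record { unsigned long from, to; };

/* Stand-in for kernel_ip(): treat "high" addresses as kernel ones. */
static bool is_kernel(unsigned long ip)
{
	return ip >= 0xffff800000000000UL;
}

static size_t emit_records(const struct record *base, size_t n)
{
	size_t skip = 0, emitted = 0;

	/* Pass 1: count what will be dropped, so the output can be sized. */
	for (size_t i = 0; i < n; i++)
		if (is_kernel(base[i].from) || is_kernel(base[i].to))
			skip++;

	printf("reserving space for %zu of %zu records\n", n - skip, n);

	/* Pass 2: emit only the records that survive the filter. */
	for (size_t i = 0; i < n; i++) {
		if (is_kernel(base[i].from) || is_kernel(base[i].to))
			continue;
		printf("branch %#lx -> %#lx\n", base[i].from, base[i].to);
		emitted++;
	}
	return emitted;
}

int main(void)
{
	struct record buf[] = {
		{ 0x400123, 0x400456 },
		{ 0xffffffff81000000UL, 0x400789 },	/* filtered out */
		{ 0x400aaa, 0x400bbb },
	};
	emit_records(buf, sizeof(buf) / sizeof(buf[0]));
	return 0;
}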
But first we need to count how many of those we + * have in the current batch. This is an extra O(n) pass, however, + * it's much faster than the other one especially considering that + * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the + * alloc_bts_buffer()). + */ + for (at = base; at < top; at++) { + /* + * Note that right now *this* BTS code only works if + * attr::exclude_kernel is set, but let's keep this extra + * check here in case that changes. + */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + skip++; + } + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, header.size * + (top - base - skip))) + return 1; + + for (at = base; at < top; at++) { + /* Filter out any records that contain kernel addresses. */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + continue; + + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; + return 1; +} + +static inline void intel_pmu_drain_pebs_buffer(void) +{ + struct pt_regs regs; + + x86_pmu.drain_pebs(®s); +} + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (!sched_in) + intel_pmu_drain_pebs_buffer(); +} + +/* + * PEBS + */ +struct event_constraint intel_core2_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_atom_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_slm_pebs_event_constraints[] = { + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_snb_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_hsw_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_skl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event) +{ + struct event_constraint *c; + + if (!event->attr.precise_ip) + return NULL; + + if (x86_pmu.pebs_constraints) { + for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &emptyconstraint; +} + +static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +{ + return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); +} + +void intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool first_pebs; + u64 threshold; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + + first_pebs = !pebs_is_enabled(cpuc); + cpuc->pebs_enabled |= 1ULL << hwc->idx; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled |= 1ULL << 63; + + /* + * When the event is constrained enough we can use a larger + * threshold and run the event with less frequent PMI. 
+ */ + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + + if (first_pebs) + perf_sched_cb_inc(event->ctx->pmu); + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + /* + * If not all events can use larger buffer, + * roll back to threshold = 1 + */ + if (!first_pebs && + (ds->pebs_interrupt_threshold > threshold)) + perf_sched_cb_dec(event->ctx->pmu); + } + + /* Use auto-reload if possible to save a MSR write in the PMI */ + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + ds->pebs_event_reset[hwc->idx] = + (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; + } + + if (first_pebs || ds->pebs_interrupt_threshold > threshold) + ds->pebs_interrupt_threshold = threshold; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool large_pebs = ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + if (large_pebs) + intel_pmu_drain_pebs_buffer(); + + cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled &= ~(1ULL << 63); + + if (large_pebs && !pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); + + if (cpuc->enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); +} + +void intel_pmu_pebs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + unsigned long from = cpuc->lbr_entries[0].from; + unsigned long old_to, to = cpuc->lbr_entries[0].to; + unsigned long ip = regs->ip; + int is_64bit = 0; + void *kaddr; + int size; + + /* + * We don't need to fixup if the PEBS assist is fault like + */ + if (!x86_pmu.intel_cap.pebs_trap) + return 1; + + /* + * No LBR entry, no basic block, no rewinding + */ + if (!cpuc->lbr_stack.nr || !from || !to) + return 0; + + /* + * Basic blocks should never cross user/kernel boundaries + */ + if (kernel_ip(ip) != kernel_ip(to)) + return 0; + + /* + * unsigned math, either ip is before the start (impossible) or + * the basic block is larger than 1 page (sanity) + */ + if ((ip - to) > PEBS_FIXUP_SIZE) + return 0; + + /* + * We sampled a branch insn, rewind using the LBR stack + */ + if (ip == to) { + set_linear_ip(regs, from); + return 1; + } + + size = ip - to; + if (!kernel_ip(ip)) { + int bytes; + u8 *buf = this_cpu_read(insn_buffer); + + /* 'size' must fit our buffer, see above */ + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != 0) + return 0; + + kaddr = buf; + } else { + kaddr = (void *)to; + } + + do { + struct insn insn; + + old_to = to; + +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, size, is_64bit); + insn_get_length(&insn); + /* + * Make sure there was not a problem decoding the + * instruction and getting the length. 
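The threshold logic in intel_pmu_pebs_enable() above picks between an interrupt per record and a nearly full buffer. A stand-alone calculation of the two interrupt thresholds, using an assumed 192-byte record (the Haswell-style layout shown earlier) and an assumed maximum of 8 PEBS-capable counters:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	 4096UL			/* assumed x86 page size */
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)	/* 64 KiB, as in this file */

int main(void)
{
	uint64_t record_size = 192;	/* assumed pebs_record_hsw size */
	uint64_t max_events = 8;	/* assumed max_pebs_events */
	uint64_t nr_records = PEBS_BUFFER_SIZE / record_size;
	uint64_t abs_max = nr_records * record_size;	/* offset from base */

	/* Default mode: interrupt after every record. */
	uint64_t per_record = record_size;

	/* Free-running mode: interrupt when one slot per counter is left. */
	uint64_t free_running = abs_max - max_events * record_size;

	printf("records in buffer     : %llu\n", (unsigned long long)nr_records);
	printf("per-record threshold  : %llu bytes\n", (unsigned long long)per_record);
	printf("free-running threshold: %llu bytes\n", (unsigned long long)free_running);
	return 0;
}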
This is + * doubly important because we have an infinite + * loop if insn.length=0. + */ + if (!insn.length) + break; + + to += insn.length; + kaddr += insn.length; + size -= insn.length; + } while (to < ip); + + if (to == ip) { + set_linear_ip(regs, old_to); + return 1; + } + + /* + * Even though we decoded the basic block, the instruction stream + * never matched the given IP, either the TO or the IP got corrupted. + */ + return 0; +} + +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) +{ + if (pebs->tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + return tsx.cycles_last_block; + } + return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) +{ + u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + + /* For RTM XABORTs also log the abort code from AX */ + if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) + txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + return txn; +} + +static void setup_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ +#define PERF_X86_EVENT_PEBS_HSW_PREC \ + (PERF_X86_EVENT_PEBS_ST_HSW | \ + PERF_X86_EVENT_PEBS_LD_HSW | \ + PERF_X86_EVENT_PEBS_NA_HSW) + /* + * We cast to the biggest pebs_record but are careful not to + * unconditionally access the 'extra' entries. + */ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct pebs_record_skl *pebs = __pebs; + u64 sample_type; + int fll, fst, dsrc; + int fl = event->hw.flags; + + if (pebs == NULL) + return; + + sample_type = event->attr.sample_type; + dsrc = sample_type & PERF_SAMPLE_DATA_SRC; + + fll = fl & PERF_X86_EVENT_PEBS_LDLAT; + fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + + perf_sample_data_init(data, 0, event->hw.last_period); + + data->period = event->hw.last_period; + + /* + * Use latency for weight (only avail with PEBS-LL) + */ + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data->weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (dsrc) { + u64 val = PERF_MEM_NA; + if (fll) + val = load_latency_data(pebs->dse); + else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) + val = precise_datala_hsw(event, pebs->dse); + else if (fst) + val = precise_store_data(pebs->dse); + data->data_src.val = val; + } + + /* + * We use the interrupt regs as a base because the PEBS record + * does not contain a full regs set, specifically it seems to + * lack segment descriptors, which get used by things like + * user_mode(). + * + * In the simple case fix up only the IP and BP,SP regs, for + * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. + * A possible PERF_SAMPLE_REGS will have to transfer all regs. 
+ */ + *regs = *iregs; + regs->flags = pebs->flags; + set_linear_ip(regs, pebs->ip); + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + regs->ax = pebs->ax; + regs->bx = pebs->bx; + regs->cx = pebs->cx; + regs->dx = pebs->dx; + regs->si = pebs->si; + regs->di = pebs->di; + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + regs->flags = pebs->flags; +#ifndef CONFIG_X86_32 + regs->r8 = pebs->r8; + regs->r9 = pebs->r9; + regs->r10 = pebs->r10; + regs->r11 = pebs->r11; + regs->r12 = pebs->r12; + regs->r13 = pebs->r13; + regs->r14 = pebs->r14; + regs->r15 = pebs->r15; +#endif + } + + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { + regs->ip = pebs->real_ip; + regs->flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; + else + regs->flags &= ~PERF_EFLAGS_EXACT; + + if ((sample_type & PERF_SAMPLE_ADDR) && + x86_pmu.intel_cap.pebs_format >= 1) + data->addr = pebs->dla; + + if (x86_pmu.intel_cap.pebs_format >= 2) { + /* Only set the TSX weight when no memory weight. */ + if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) + data->weight = intel_hsw_weight(pebs); + + if (sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = intel_hsw_transaction(pebs); + } + + /* + * v3 supplies an accurate time stamp, so we use that + * for the time stamp. + * + * We can only do this for the default trace clock. + */ + if (x86_pmu.intel_cap.pebs_format >= 3 && + event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(pebs->tsc); + + if (has_branch_stack(event)) + data->br_stack = &cpuc->lbr_stack; +} + +static inline void * +get_next_pebs_record_by_bit(void *base, void *top, int bit) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + void *at; + u64 pebs_status; + + /* + * fmt0 does not have a status bitfield (does not use + * perf_record_nhm format) + */ + if (x86_pmu.intel_cap.pebs_format < 1) + return base; + + if (base == NULL) + return NULL; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + if (test_bit(bit, (unsigned long *)&p->status)) { + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) + return at; + + if (p->status == (1 << bit)) + return at; + + /* clear non-PEBS bit and re-check */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status == (1 << bit)) + return at; + } + } + return NULL; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, + void *base, void *top, + int bit, int count) +{ + struct perf_sample_data data; + struct pt_regs regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + + if (!intel_pmu_save_and_restart(event) && + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) + return; + + while (count > 1) { + setup_pebs_sample_data(event, iregs, at, &data, ®s); + perf_event_output(event, &data, ®s); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + count--; + } + + setup_pebs_sample_data(event, iregs, at, &data, ®s); + + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. 
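The drain loop just above follows a simple pattern: emit all but the last record directly, and route the final one through the overflow handler so period bookkeeping and throttling happen exactly once per PMI. A reduced sketch (emit_sample() and handle_overflow() are hypothetical stand-ins for perf_event_output() and perf_event_overflow()):

        #include <stdio.h>
        #include <stdbool.h>
        #include <stddef.h>

        /* Hypothetical stand-ins for the perf output helpers. */
        static void emit_sample(const char *rec)     { printf("sample %s\n", rec); }
        static bool handle_overflow(const char *rec) { printf("overflow %s\n", rec); return false; }

        /*
         * Emit 'count' fixed-size records: the first count-1 go straight to
         * the output path, the last one goes through the overflow handler.
         */
        static void drain_records(const char *at, size_t record_size, int count)
        {
                while (count > 1) {
                        emit_sample(at);
                        at += record_size;
                        count--;
                }

                if (handle_overflow(at))
                        printf("stopping event\n");   /* x86_pmu_stop() in the real code */
        }

        int main(void)
        {
                static const char buf[3][8] = { "rec0", "rec1", "rec2" };

                drain_records(buf[0], sizeof(buf[0]), 3);
                return 0;
        }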
+ */ + if (perf_event_overflow(event, &data, ®s)) { + x86_pmu_stop(event, 0); + return; + } + +} + +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = cpuc->events[0]; /* PMC0 only */ + struct pebs_record_core *at, *top; + int n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; + + /* + * Whatever else happens, drain the thing + */ + ds->pebs_index = ds->pebs_buffer_base; + + if (!test_bit(0, cpuc->active_mask)) + return; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + return; + + n = top - at; + if (n <= 0) + return; + + __intel_pmu_pebs_event(event, iregs, at, top, 0, n); +} + +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event; + void *base, *at, *top; + short counts[MAX_PEBS_EVENTS] = {}; + short error[MAX_PEBS_EVENTS] = {}; + int bit, i; + + if (!x86_pmu.pebs_active) + return; + + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + if (unlikely(base >= top)) + return; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + u64 pebs_status; + + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) { + for_each_set_bit(bit, (unsigned long *)&p->status, + MAX_PEBS_EVENTS) + counts[bit]++; + + continue; + } + + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + /* + * On some CPUs the PEBS status can be zero when PEBS is + * racing with clearing of GLOBAL_STATUS. + * + * Normally we would drop that record, but in the + * case when there is only a single active PEBS event + * we can assume it's for that event. + */ + if (!pebs_status && cpuc->pebs_enabled && + !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) + pebs_status = cpuc->pebs_enabled; + + bit = find_first_bit((unsigned long *)&pebs_status, + x86_pmu.max_pebs_events); + if (bit >= x86_pmu.max_pebs_events) + continue; + + /* + * The PEBS hardware does not deal well with the situation + * when events happen near to each other and multiple bits + * are set. But it should happen rarely. + * + * If these events include one PEBS and multiple non-PEBS + * events, it doesn't impact PEBS record. The record will + * be handled normally. (slow path) + * + * If these events include two or more PEBS events, the + * records for the events can be collapsed into a single + * one, and it's not possible to reconstruct all events + * that caused the PEBS record. It's called collision. + * If collision happened, the record will be dropped. 
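A reduced model of the collision accounting described in that comment (plain C; __builtin_ctzll() stands in for find_first_bit(), and the masks are made up): a record whose status carries more than the one expected PEBS bit is dropped, and every candidate event gets an error tick instead of a sample.

        #include <stdio.h>
        #include <stdint.h>

        #define MAX_PEBS_EVENTS 8

        static void account_record(uint64_t status, uint64_t pebs_enabled,
                                   int counts[], int error[])
        {
                /* Only consider bits belonging to currently enabled PEBS events. */
                uint64_t pebs_status = status & pebs_enabled &
                                       ((1ULL << MAX_PEBS_EVENTS) - 1);

                if (!pebs_status)
                        return;                         /* nothing we can attribute */

                int bit = __builtin_ctzll(pebs_status); /* first candidate event */

                if (status != (1ULL << bit)) {
                        /* Collision: several PEBS events share one record; drop it. */
                        for (int i = 0; i < MAX_PEBS_EVENTS; i++)
                                if (pebs_status & (1ULL << i))
                                        error[i]++;
                        return;
                }

                counts[bit]++;                          /* clean single-event record */
        }

        int main(void)
        {
                int counts[MAX_PEBS_EVENTS] = {0}, error[MAX_PEBS_EVENTS] = {0};

                account_record(0x1, 0x3, counts, error);  /* clean record for bit 0 */
                account_record(0x3, 0x3, counts, error);  /* collision: bits 0 and 1 */

                printf("counts[0]=%d error[0]=%d error[1]=%d\n",
                       counts[0], error[0], error[1]);
                return 0;
        }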
+ */ + if (p->status != (1ULL << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) + error[i]++; + continue; + } + + counts[bit]++; + } + + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + if ((counts[bit] == 0) && (error[bit] == 0)) + continue; + + event = cpuc->events[bit]; + WARN_ON_ONCE(!event); + WARN_ON_ONCE(!event->attr.precise_ip); + + /* log dropped samples number */ + if (error[bit]) + perf_log_lost_samples(event, error[bit]); + + if (counts[bit]) { + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit]); + } + } +} + +/* + * BTS, PEBS probe and setup + */ + +void __init intel_ds_init(void) +{ + /* + * No support for 32bit formats + */ + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + if (x86_pmu.pebs) { + char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; + int format = x86_pmu.intel_cap.pebs_format; + + switch (format) { + case 0: + pr_cont("PEBS fmt0%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; + break; + + case 1: + pr_cont("PEBS fmt1%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + case 2: + pr_cont("PEBS fmt2%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + case 3: + pr_cont("PEBS fmt3%c, ", pebs_type); + x86_pmu.pebs_record_size = + sizeof(struct pebs_record_skl); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + break; + + default: + pr_cont("no PEBS fmt%d%c, ", format, pebs_type); + x86_pmu.pebs = 0; + } + } +} + +void perf_restore_debug_store(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1bc881bd91ea..035bdb664d71 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c deleted file mode 100644 index 7c79261ed939..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ /dev/null @@ -1,1368 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "perf_event.h" - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_FIXUP_SIZE PAGE_SIZE - -/* - * pebs_record_32 for p4 and core not supported - -struct pebs_record_32 { - u32 flags, ip; - u32 ax, bc, cx, dx; - u32 si, di, bp, sp; -}; - - */ - -union intel_x86_pebs_dse { - u64 val; - struct { - unsigned int ld_dse:4; - unsigned int ld_stlb_miss:1; - unsigned int ld_locked:1; - unsigned int ld_reserved:26; - }; - struct { - unsigned int st_l1d_hit:1; - unsigned int st_reserved1:3; - unsigned int 
st_stlb_miss:1; - unsigned int st_locked:1; - unsigned int st_reserved2:26; - }; -}; - - -/* - * Map PEBS Load Latency Data Source encodings to generic - * memory data source information - */ -#define P(a, b) PERF_MEM_S(a, b) -#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) -#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) - -static const u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ -}; - -static u64 precise_store_data(u64 status) -{ - union intel_x86_pebs_dse dse; - u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); - - dse.val = status; - - /* - * bit 4: TLB access - * 1 = stored missed 2nd level TLB - * - * so it either hit the walker or the OS - * otherwise hit 2nd level TLB - */ - if (dse.st_stlb_miss) - val |= P(TLB, MISS); - else - val |= P(TLB, HIT); - - /* - * bit 0: hit L1 data cache - * if not set, then all we know is that - * it missed L1D - */ - if (dse.st_l1d_hit) - val |= P(LVL, HIT); - else - val |= P(LVL, MISS); - - /* - * bit 5: Locked prefix - */ - if (dse.st_locked) - val |= P(LOCK, LOCKED); - - return val; -} - -static u64 precise_datala_hsw(struct perf_event *event, u64 status) -{ - union perf_mem_data_src dse; - - dse.val = PERF_MEM_NA; - - if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) - dse.mem_op = PERF_MEM_OP_STORE; - else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) - dse.mem_op = PERF_MEM_OP_LOAD; - - /* - * L1 info only valid for following events: - * - * MEM_UOPS_RETIRED.STLB_MISS_STORES - * MEM_UOPS_RETIRED.LOCK_STORES - * MEM_UOPS_RETIRED.SPLIT_STORES - * MEM_UOPS_RETIRED.ALL_STORES - */ - if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) { - if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; - else - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; - } - return dse.val; -} - -static u64 load_latency_data(u64 status) -{ - union intel_x86_pebs_dse dse; - u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; - - dse.val = status; - - /* - * use the mapping table for bit 0-3 - */ - val = pebs_data_source[dse.ld_dse]; - - /* - * Nehalem models do not support TLB, Lock infos - */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { - val |= P(TLB, NA) | P(LOCK, NA); - return val; - } - /* - * bit 4: TLB access - * 0 = did not miss 2nd level TLB - * 1 = missed 2nd level TLB - */ - if (dse.ld_stlb_miss) - val |= P(TLB, MISS) | P(TLB, L2); - else - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); - - /* - * bit 5: locked prefix - */ - if 
(dse.ld_locked) - val |= P(LOCK, LOCKED); - - return val; -} - -struct pebs_record_core { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; -}; - -struct pebs_record_nhm { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; -}; - -/* - * Same as pebs_record_nhm, with two additional fields. - */ -struct pebs_record_hsw { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; - u64 real_ip, tsx_tuning; -}; - -union hsw_tsx_tuning { - struct { - u32 cycles_last_block : 32, - hle_abort : 1, - rtm_abort : 1, - instruction_abort : 1, - non_instruction_abort : 1, - retry : 1, - data_conflict : 1, - capacity_writes : 1, - capacity_reads : 1; - }; - u64 value; -}; - -#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL - -/* Same as HSW, plus TSC */ - -struct pebs_record_skl { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; - u64 real_ip, tsx_tuning; - u64 tsc; -}; - -void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_events, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static DEFINE_PER_CPU(void *, insn_buffer); - -static int alloc_pebs_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; - - if (!x86_pmu.pebs) - return 0; - - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); - if (unlikely(!buffer)) - return -ENOMEM; - - /* - * HSW+ already provides us the eventing ip; no need to allocate this - * buffer then. 
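To get a feel for the capacity that the division just below in alloc_pebs_buffer() yields, the record layouts above can be replayed as plain structs (4 KiB pages assumed, so PEBS_BUFFER_SIZE works out to 64 KiB); this is an illustration, not part of the patch:

        #include <stdio.h>
        #include <stdint.h>

        #define PAGE_SIZE        4096                   /* assumed */
        #define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)       /* 64 KiB, as above */

        /* Field-for-field copies of the record layouts above. */
        struct pebs_record_nhm {
                uint64_t flags, ip;
                uint64_t ax, bx, cx, dx;
                uint64_t si, di, bp, sp;
                uint64_t r8, r9, r10, r11;
                uint64_t r12, r13, r14, r15;
                uint64_t status, dla, dse, lat;
        };

        struct pebs_record_hsw {
                struct pebs_record_nhm nhm;             /* same leading fields */
                uint64_t real_ip, tsx_tuning;
        };

        int main(void)
        {
                printf("fmt1 record: %zu bytes, %zu per buffer\n",
                       sizeof(struct pebs_record_nhm),
                       (size_t)PEBS_BUFFER_SIZE / sizeof(struct pebs_record_nhm));
                printf("fmt2 record: %zu bytes, %zu per buffer\n",
                       sizeof(struct pebs_record_hsw),
                       (size_t)PEBS_BUFFER_SIZE / sizeof(struct pebs_record_hsw));
                return 0;
        }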
- */ - if (x86_pmu.intel_cap.pebs_format < 2) { - ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); - if (!ibuffer) { - kfree(buffer); - return -ENOMEM; - } - per_cpu(insn_buffer, cpu) = ibuffer; - } - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - - return 0; -} - -static void release_pebs_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds || !x86_pmu.pebs) - return; - - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; - - kfree((void *)(unsigned long)ds->pebs_buffer_base); - ds->pebs_buffer_base = 0; -} - -static int alloc_bts_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; - - if (!x86_pmu.bts) - return 0; - - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); - if (unlikely(!buffer)) { - WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); - return -ENOMEM; - } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - - return 0; -} - -static void release_bts_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds || !x86_pmu.bts) - return; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - ds->bts_buffer_base = 0; -} - -static int alloc_ds_buffer(int cpu) -{ - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; - - per_cpu(cpu_hw_events, cpu).ds = ds; - - return 0; -} - -static void release_ds_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); -} - -void release_ds_buffers(void) -{ - int cpu; - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - get_online_cpus(); - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - release_pebs_buffer(cpu); - release_bts_buffer(cpu); - release_ds_buffer(cpu); - } - put_online_cpus(); -} - -void reserve_ds_buffers(void) -{ - int bts_err = 0, pebs_err = 0; - int cpu; - - x86_pmu.bts_active = 0; - x86_pmu.pebs_active = 0; - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - if (!x86_pmu.bts) - bts_err = 1; - - if (!x86_pmu.pebs) - pebs_err = 1; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - if (alloc_ds_buffer(cpu)) { - bts_err = 1; - pebs_err = 1; - } - - if (!bts_err && alloc_bts_buffer(cpu)) - bts_err = 1; - - if (!pebs_err && alloc_pebs_buffer(cpu)) - pebs_err = 1; - - if (bts_err && pebs_err) - break; - } - - if (bts_err) { - for_each_possible_cpu(cpu) - release_bts_buffer(cpu); - } - - if (pebs_err) { - for_each_possible_cpu(cpu) - release_pebs_buffer(cpu); - } - - if (bts_err && pebs_err) { - for_each_possible_cpu(cpu) - release_ds_buffer(cpu); - } else { - if (x86_pmu.bts && !bts_err) - x86_pmu.bts_active = 1; - - if (x86_pmu.pebs && !pebs_err) - x86_pmu.pebs_active = 1; - - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); -} - -/* - * BTS - */ - -struct event_constraint 
bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); - -void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= DEBUGCTLMSR_TR; - debugctlmsr |= DEBUGCTLMSR_BTS; - if (config & ARCH_PERFMON_EVENTSEL_INT) - debugctlmsr |= DEBUGCTLMSR_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | - DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - -int intel_pmu_drain_bts_buffer(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *base, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - unsigned long skip = 0; - struct pt_regs regs; - - if (!event) - return 0; - - if (!x86_pmu.bts_active) - return 0; - - base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= base) - return 0; - - memset(®s, 0, sizeof(regs)); - - ds->bts_index = ds->bts_buffer_base; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - /* - * BTS leaks kernel addresses in branches across the cpl boundary, - * such as traps or system calls, so unless the user is asking for - * kernel tracing (and right now it's not possible), we'd need to - * filter them out. But first we need to count how many of those we - * have in the current batch. This is an extra O(n) pass, however, - * it's much faster than the other one especially considering that - * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the - * alloc_bts_buffer()). - */ - for (at = base; at < top; at++) { - /* - * Note that right now *this* BTS code only works if - * attr::exclude_kernel is set, but let's keep this extra - * check here in case that changes. - */ - if (event->attr.exclude_kernel && - (kernel_ip(at->from) || kernel_ip(at->to))) - skip++; - } - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) - return 1; - - for (at = base; at < top; at++) { - /* Filter out any records that contain kernel addresses. */ - if (event->attr.exclude_kernel && - (kernel_ip(at->from) || kernel_ip(at->to))) - continue; - - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. 
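The "n <= 2560" figure in the comment just above falls straight out of the BTS buffer constants defined earlier in this file; a quick standalone check (assuming 4 KiB pages):

        #include <stdio.h>

        #define PAGE_SIZE       4096                /* assumed */
        #define BTS_BUFFER_SIZE (PAGE_SIZE << 4)    /* 64 KiB */
        #define BTS_RECORD_SIZE 24                  /* from/to/flags, 3 x u64 */

        int main(void)
        {
                int max    = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;   /* 2730 records */
                int thresh = max / 16;                            /* 170 records  */

                /* records that fit before the interrupt threshold is reached */
                printf("max=%d thresh=%d usable=%d\n", max, thresh, max - thresh);
                return 0;
        }

This reproduces the 15/16 ratio that alloc_bts_buffer() programs into bts_interrupt_threshold.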
*/ - event->hw.interrupts++; - event->pending_kill = POLL_IN; - return 1; -} - -static inline void intel_pmu_drain_pebs_buffer(void) -{ - struct pt_regs regs; - - x86_pmu.drain_pebs(®s); -} - -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (!sched_in) - intel_pmu_drain_pebs_buffer(); -} - -/* - * PEBS - */ -struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_slm_pebs_event_constraints[] = { - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_nehalem_pebs_event_constraints[] = { - INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_westmere_pebs_event_constraints[] = { - INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ - INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ - INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_skl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). 
*/ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint *intel_pebs_constraints(struct perf_event *event) -{ - struct event_constraint *c; - - if (!event->attr.precise_ip) - return NULL; - - if (x86_pmu.pebs_constraints) { - for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - event->hw.flags |= c->flags; - return c; - } - } - } - - return &emptyconstraint; -} - -static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) -{ - return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); -} - -void intel_pmu_pebs_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool first_pebs; - u64 threshold; - - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - first_pebs = !pebs_is_enabled(cpuc); - cpuc->pebs_enabled |= 1ULL << hwc->idx; - - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) - cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) - cpuc->pebs_enabled |= 1ULL << 63; - - /* - * When the event is constrained enough we can use a larger - * threshold and run the event with less frequent PMI. 
- */ - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { - threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; - - if (first_pebs) - perf_sched_cb_inc(event->ctx->pmu); - } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - /* - * If not all events can use larger buffer, - * roll back to threshold = 1 - */ - if (!first_pebs && - (ds->pebs_interrupt_threshold > threshold)) - perf_sched_cb_dec(event->ctx->pmu); - } - - /* Use auto-reload if possible to save a MSR write in the PMI */ - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - ds->pebs_event_reset[hwc->idx] = - (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; - } - - if (first_pebs || ds->pebs_interrupt_threshold > threshold) - ds->pebs_interrupt_threshold = threshold; -} - -void intel_pmu_pebs_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool large_pebs = ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - if (large_pebs) - intel_pmu_drain_pebs_buffer(); - - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) - cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) - cpuc->pebs_enabled &= ~(1ULL << 63); - - if (large_pebs && !pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; -} - -void intel_pmu_pebs_enable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); -} - -void intel_pmu_pebs_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); -} - -static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned long from = cpuc->lbr_entries[0].from; - unsigned long old_to, to = cpuc->lbr_entries[0].to; - unsigned long ip = regs->ip; - int is_64bit = 0; - void *kaddr; - int size; - - /* - * We don't need to fixup if the PEBS assist is fault like - */ - if (!x86_pmu.intel_cap.pebs_trap) - return 1; - - /* - * No LBR entry, no basic block, no rewinding - */ - if (!cpuc->lbr_stack.nr || !from || !to) - return 0; - - /* - * Basic blocks should never cross user/kernel boundaries - */ - if (kernel_ip(ip) != kernel_ip(to)) - return 0; - - /* - * unsigned math, either ip is before the start (impossible) or - * the basic block is larger than 1 page (sanity) - */ - if ((ip - to) > PEBS_FIXUP_SIZE) - return 0; - - /* - * We sampled a branch insn, rewind using the LBR stack - */ - if (ip == to) { - set_linear_ip(regs, from); - return 1; - } - - size = ip - to; - if (!kernel_ip(ip)) { - int bytes; - u8 *buf = this_cpu_read(insn_buffer); - - /* 'size' must fit our buffer, see above */ - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != 0) - return 0; - - kaddr = buf; - } else { - kaddr = (void *)to; - } - - do { - struct insn insn; - - old_to = to; - -#ifdef CONFIG_X86_64 - is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, kaddr, size, is_64bit); - insn_get_length(&insn); - /* - * Make sure there was not a problem decoding the - * instruction and getting the length. 
This is - * doubly important because we have an infinite - * loop if insn.length=0. - */ - if (!insn.length) - break; - - to += insn.length; - kaddr += insn.length; - size -= insn.length; - } while (to < ip); - - if (to == ip) { - set_linear_ip(regs, old_to); - return 1; - } - - /* - * Even though we decoded the basic block, the instruction stream - * never matched the given IP, either the TO or the IP got corrupted. - */ - return 0; -} - -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) -{ - if (pebs->tsx_tuning) { - union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; - return tsx.cycles_last_block; - } - return 0; -} - -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) -{ - u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; - - /* For RTM XABORTs also log the abort code from AX */ - if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) - txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; - return txn; -} - -static void setup_pebs_sample_data(struct perf_event *event, - struct pt_regs *iregs, void *__pebs, - struct perf_sample_data *data, - struct pt_regs *regs) -{ -#define PERF_X86_EVENT_PEBS_HSW_PREC \ - (PERF_X86_EVENT_PEBS_ST_HSW | \ - PERF_X86_EVENT_PEBS_LD_HSW | \ - PERF_X86_EVENT_PEBS_NA_HSW) - /* - * We cast to the biggest pebs_record but are careful not to - * unconditionally access the 'extra' entries. - */ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_skl *pebs = __pebs; - u64 sample_type; - int fll, fst, dsrc; - int fl = event->hw.flags; - - if (pebs == NULL) - return; - - sample_type = event->attr.sample_type; - dsrc = sample_type & PERF_SAMPLE_DATA_SRC; - - fll = fl & PERF_X86_EVENT_PEBS_LDLAT; - fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); - - perf_sample_data_init(data, 0, event->hw.last_period); - - data->period = event->hw.last_period; - - /* - * Use latency for weight (only avail with PEBS-LL) - */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; - - /* - * data.data_src encodes the data source - */ - if (dsrc) { - u64 val = PERF_MEM_NA; - if (fll) - val = load_latency_data(pebs->dse); - else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) - val = precise_datala_hsw(event, pebs->dse); - else if (fst) - val = precise_store_data(pebs->dse); - data->data_src.val = val; - } - - /* - * We use the interrupt regs as a base because the PEBS record - * does not contain a full regs set, specifically it seems to - * lack segment descriptors, which get used by things like - * user_mode(). - * - * In the simple case fix up only the IP and BP,SP regs, for - * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. - * A possible PERF_SAMPLE_REGS will have to transfer all regs. 
- */ - *regs = *iregs; - regs->flags = pebs->flags; - set_linear_ip(regs, pebs->ip); - regs->bp = pebs->bp; - regs->sp = pebs->sp; - - if (sample_type & PERF_SAMPLE_REGS_INTR) { - regs->ax = pebs->ax; - regs->bx = pebs->bx; - regs->cx = pebs->cx; - regs->dx = pebs->dx; - regs->si = pebs->si; - regs->di = pebs->di; - regs->bp = pebs->bp; - regs->sp = pebs->sp; - - regs->flags = pebs->flags; -#ifndef CONFIG_X86_32 - regs->r8 = pebs->r8; - regs->r9 = pebs->r9; - regs->r10 = pebs->r10; - regs->r11 = pebs->r11; - regs->r12 = pebs->r12; - regs->r13 = pebs->r13; - regs->r14 = pebs->r14; - regs->r15 = pebs->r15; -#endif - } - - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs->ip = pebs->real_ip; - regs->flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) - regs->flags |= PERF_EFLAGS_EXACT; - else - regs->flags &= ~PERF_EFLAGS_EXACT; - - if ((sample_type & PERF_SAMPLE_ADDR) && - x86_pmu.intel_cap.pebs_format >= 1) - data->addr = pebs->dla; - - if (x86_pmu.intel_cap.pebs_format >= 2) { - /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_hsw_weight(pebs); - - if (sample_type & PERF_SAMPLE_TRANSACTION) - data->txn = intel_hsw_transaction(pebs); - } - - /* - * v3 supplies an accurate time stamp, so we use that - * for the time stamp. - * - * We can only do this for the default trace clock. - */ - if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) - data->time = native_sched_clock_from_tsc(pebs->tsc); - - if (has_branch_stack(event)) - data->br_stack = &cpuc->lbr_stack; -} - -static inline void * -get_next_pebs_record_by_bit(void *base, void *top, int bit) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - void *at; - u64 pebs_status; - - /* - * fmt0 does not have a status bitfield (does not use - * perf_record_nhm format) - */ - if (x86_pmu.intel_cap.pebs_format < 1) - return base; - - if (base == NULL) - return NULL; - - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; - - if (test_bit(bit, (unsigned long *)&p->status)) { - /* PEBS v3 has accurate status bits */ - if (x86_pmu.intel_cap.pebs_format >= 3) - return at; - - if (p->status == (1 << bit)) - return at; - - /* clear non-PEBS bit and re-check */ - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status == (1 << bit)) - return at; - } - } - return NULL; -} - -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, - void *base, void *top, - int bit, int count) -{ - struct perf_sample_data data; - struct pt_regs regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - - if (!intel_pmu_save_and_restart(event) && - !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) - return; - - while (count > 1) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } - - setup_pebs_sample_data(event, iregs, at, &data, ®s); - - /* - * All but the last records are processed. - * The last one is left to be able to call the overflow handler. 
- */ - if (perf_event_overflow(event, &data, ®s)) { - x86_pmu_stop(event, 0); - return; - } - -} - -static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct perf_event *event = cpuc->events[0]; /* PMC0 only */ - struct pebs_record_core *at, *top; - int n; - - if (!x86_pmu.pebs_active) - return; - - at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; - - /* - * Whatever else happens, drain the thing - */ - ds->pebs_index = ds->pebs_buffer_base; - - if (!test_bit(0, cpuc->active_mask)) - return; - - WARN_ON_ONCE(!event); - - if (!event->attr.precise_ip) - return; - - n = top - at; - if (n <= 0) - return; - - __intel_pmu_pebs_event(event, iregs, at, top, 0, n); -} - -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct perf_event *event; - void *base, *at, *top; - short counts[MAX_PEBS_EVENTS] = {}; - short error[MAX_PEBS_EVENTS] = {}; - int bit, i; - - if (!x86_pmu.pebs_active) - return; - - base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; - - ds->pebs_index = ds->pebs_buffer_base; - - if (unlikely(base >= top)) - return; - - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; - u64 pebs_status; - - /* PEBS v3 has accurate status bits */ - if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) - counts[bit]++; - - continue; - } - - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - - /* - * On some CPUs the PEBS status can be zero when PEBS is - * racing with clearing of GLOBAL_STATUS. - * - * Normally we would drop that record, but in the - * case when there is only a single active PEBS event - * we can assume it's for that event. - */ - if (!pebs_status && cpuc->pebs_enabled && - !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; - - bit = find_first_bit((unsigned long *)&pebs_status, - x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) - continue; - - /* - * The PEBS hardware does not deal well with the situation - * when events happen near to each other and multiple bits - * are set. But it should happen rarely. - * - * If these events include one PEBS and multiple non-PEBS - * events, it doesn't impact PEBS record. The record will - * be handled normally. (slow path) - * - * If these events include two or more PEBS events, the - * records for the events can be collapsed into a single - * one, and it's not possible to reconstruct all events - * that caused the PEBS record. It's called collision. - * If collision happened, the record will be dropped. 
- */ - if (p->status != (1ULL << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) - error[i]++; - continue; - } - - counts[bit]++; - } - - for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { - if ((counts[bit] == 0) && (error[bit] == 0)) - continue; - - event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); - - /* log dropped samples number */ - if (error[bit]) - perf_log_lost_samples(event, error[bit]); - - if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, base, - top, bit, counts[bit]); - } - } -} - -/* - * BTS, PEBS probe and setup - */ - -void __init intel_ds_init(void) -{ - /* - * No support for 32bit formats - */ - if (!boot_cpu_has(X86_FEATURE_DTES64)) - return; - - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); - if (x86_pmu.pebs) { - char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; - int format = x86_pmu.intel_cap.pebs_format; - - switch (format) { - case 0: - pr_cont("PEBS fmt0%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - break; - - case 1: - pr_cont("PEBS fmt1%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - break; - - case 2: - pr_cont("PEBS fmt2%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - break; - - case 3: - pr_cont("PEBS fmt3%c, ", pebs_type); - x86_pmu.pebs_record_size = - sizeof(struct pebs_record_skl); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; - break; - - default: - pr_cont("no PEBS fmt%d%c, ", format, pebs_type); - x86_pmu.pebs = 0; - } - } -} - -void perf_restore_debug_store(void) -{ - struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); -} -- cgit From c85cc4497f823b83379a23e798018d69fe566185 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:12 +0100 Subject: perf/x86: Move perf_event_intel_lbr.c ........ 
=> x86/events/intel/lbr.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/lbr.c | 1062 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/perf_event_intel_lbr.c | 1062 ---------------------------- 4 files changed, 1063 insertions(+), 1064 deletions(-) create mode 100644 arch/x86/events/intel/lbr.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_lbr.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index c4d41b66d167..48097fd4aa71 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,4 +6,4 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c new file mode 100644 index 000000000000..78c88f9293cc --- /dev/null +++ b/arch/x86/events/intel/lbr.c @@ -0,0 +1,1062 @@ +#include +#include + +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, +}; + +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, +}; + +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ +#define LBR_CALL_STACK_BIT 9 /* enable call stack */ + +/* + * Following bit only exists in Linux; we mask it out before writing it to + * the actual MSR. But it helps the constraint perf code to understand + * that this is a separate configuration. + */ +#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. 
*/ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) +#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) + +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + +/* + * We only support LBR implementations that have FREEZE_LBRS_ON_PMI + * otherwise it becomes near impossible to get a reliable stack. + */ + +static void __intel_pmu_lbr_enable(bool pmi) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 debugctl, lbr_select = 0, orig_debugctl; + + /* + * No need to unfreeze manually, as v4 can do that as part + * of the GLOBAL_STATUS ack. + */ + if (pmi && x86_pmu.version >= 4) + return; + + /* + * No need to reprogram LBR_SELECT in a PMI, as it + * did not change. + */ + if (cpuc->lbr_sel) + lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; + if (!pmi && cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, lbr_select); + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + orig_debugctl = debugctl; + debugctl |= DEBUGCTLMSR_LBR; + /* + * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. + * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions + * may cause superfluous increase/decrease of LBR_TOS. 
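Condensed, the DEBUGCTL update that follows amounts to: always set the LBR bit, add FREEZE_LBRS_ON_PMI only when call-stack mode is off, and skip the MSR write when the value did not change. A sketch of that decision (the bit positions below are illustrative placeholders, not taken from msr-index.h):

        #include <stdio.h>
        #include <stdint.h>
        #include <stdbool.h>

        /* Illustrative bit positions only. */
        #define DEBUGCTLMSR_LBR                 (1ULL << 0)
        #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI  (1ULL << 11)
        #define LBR_CALL_STACK                  (1ULL << 9)

        static uint64_t next_debugctl(uint64_t debugctl, uint64_t lbr_select)
        {
                debugctl |= DEBUGCTLMSR_LBR;

                /* Call-stack mode and FREEZE_LBRS_ON_PMI do not mix well. */
                if (!(lbr_select & LBR_CALL_STACK))
                        debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;

                return debugctl;
        }

        int main(void)
        {
                uint64_t cur     = 0;
                uint64_t updated = next_debugctl(cur, LBR_CALL_STACK);

                /* Only worth a wrmsrl() if the value actually changed. */
                bool need_write = (updated != cur);
                printf("debugctl=%#llx write=%d\n",
                       (unsigned long long)updated, need_write);
                return 0;
        }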
+ */ + if (!(lbr_select & LBR_CALL_STACK)) + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void intel_pmu_lbr_reset_32(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) + wrmsrl(x86_pmu.lbr_from + i, 0); +} + +static void intel_pmu_lbr_reset_64(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + wrmsrl(x86_pmu.lbr_from + i, 0); + wrmsrl(x86_pmu.lbr_to + i, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, 0); + } +} + +void intel_pmu_lbr_reset(void) +{ + if (!x86_pmu.lbr_nr) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_reset_32(); + else + intel_pmu_lbr_reset_64(); +} + +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + return tos; +} + +enum { + LBR_NONE, + LBR_VALID, +}; + +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = task_ctx->tos; + for (i = 0; i < tos; i++) { + lbr_idx = (tos - i) & mask; + wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + } + wrmsrl(x86_pmu.lbr_tos, tos); + task_ctx->lbr_stack_state = LBR_NONE; +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < tos; i++) { + lbr_idx = (tos - i) & mask; + rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + } + task_ctx->tos = tos; + task_ctx->lbr_stack_state = LBR_VALID; +} + +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + /* + * If LBR callstack feature is enabled and the stack was saved when + * the task was scheduled out, restore the stack. Otherwise flush + * the LBR stack. + */ + task_ctx = ctx ? ctx->task_ctx_data : NULL; + if (task_ctx) { + if (sched_in) { + __intel_pmu_lbr_restore(task_ctx); + cpuc->lbr_context = ctx; + } else { + __intel_pmu_lbr_save(task_ctx); + } + return; + } + + /* + * When sampling the branck stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. 
This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branch from multiple tasks. + */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + +static inline bool branch_user_callstack(unsigned br_sel) +{ + return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); +} + +void intel_pmu_lbr_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + if (!x86_pmu.lbr_nr) + return; + + /* + * Reset the LBR stack if we changed task context to + * avoid data leaks. + */ + if (event->ctx->task && cpuc->lbr_context != event->ctx) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = event->ctx; + } + cpuc->br_sel = event->hw.branch_reg.reg; + + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users++; + } + + cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); +} + +void intel_pmu_lbr_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + if (!x86_pmu.lbr_nr) + return; + + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users--; + } + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); + + if (cpuc->enabled && !cpuc->lbr_users) { + __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } +} + +void intel_pmu_lbr_enable_all(bool pmi) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_enable(pmi); +} + +void intel_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_disable(); +} + +static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + u64 tos = intel_pmu_lbr_tos(); + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + union { + struct { + u32 from; + u32 to; + }; + u64 lbr; + } msr_lastbranch; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); + + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; + } + cpuc->lbr_stack.nr = i; +} + +/* + * Due to lack of segmentation in Linux the effective address (offset) + * is the same as the linear address, allowing us to merge the LIP and EIP + * LBR formats. 
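The 64-bit read path below has to strip the flag bits that the EIP_FLAGS/TSX formats keep in the top of MSR_LBR_*_FROM and then sign-extend the remaining canonical address. A standalone sketch of that decoding step (the constants mirror the LBR_FROM_FLAG_* definitions above; skip = 3 corresponds to the TSX layout):

#include <stdbool.h>
#include <stdint.h>

#define FLAG_MISPRED    (1ULL << 63)
#define FLAG_IN_TX      (1ULL << 62)
#define FLAG_ABORT      (1ULL << 61)

struct lbr_from_fields {
        uint64_t addr;
        bool mispred, in_tx, abort;
};

/* Decode a FROM value whose top 'skip' bits carry flags, as above. */
static struct lbr_from_fields decode_lbr_from(uint64_t from, int skip)
{
        struct lbr_from_fields f;

        f.mispred = !!(from & FLAG_MISPRED);
        f.in_tx   = !!(from & FLAG_IN_TX);
        f.abort   = !!(from & FLAG_ABORT);
        /* shift the flags out, then sign-extend the address back in */
        f.addr = (uint64_t)(((int64_t)from << skip) >> skip);

        return f;
}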
+ */ +static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +{ + bool need_info = false; + unsigned long mask = x86_pmu.lbr_nr - 1; + int lbr_format = x86_pmu.intel_cap.lbr_format; + u64 tos = intel_pmu_lbr_tos(); + int i; + int out = 0; + int num = x86_pmu.lbr_nr; + + if (cpuc->lbr_sel) { + need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + } + + for (i = 0; i < num; i++) { + unsigned long lbr_idx = (tos - i) & mask; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + u16 cycles = 0; + int lbr_flags = lbr_desc[lbr_format]; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, from); + rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + + if (lbr_format == LBR_FORMAT_INFO && need_info) { + u64 info; + + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + cycles = (info & LBR_INFO_CYCLES); + } + if (lbr_flags & LBR_EIP_FLAGS) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + /* + * Some CPUs report duplicated abort records, + * with the second entry not having an abort bit set. + * Skip them here. This loop runs backwards, + * so we need to undo the previous record. + * If the abort just happened outside the window + * the extra entry cannot be removed. + */ + if (abort && x86_pmu.lbr_double_abort && out > 0) + out--; + + cpuc->lbr_entries[out].from = from; + cpuc->lbr_entries[out].to = to; + cpuc->lbr_entries[out].mispred = mis; + cpuc->lbr_entries[out].predicted = pred; + cpuc->lbr_entries[out].in_tx = in_tx; + cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].reserved = 0; + out++; + } + cpuc->lbr_stack.nr = out; +} + +void intel_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!cpuc->lbr_users) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_read_32(cpuc); + else + intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { + if (!x86_pmu_has_lbr_callstack()) + return -EOPNOTSUPP; + if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) + return -EINVAL; + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | + X86_BR_CALL_STACK; + } + + if 
(br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; + return 0; +} + +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & (1ULL << i))) + continue; + + v = x86_pmu.lbr_sel_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGN) + mask |= v; + } + + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* + * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate + * in suppress mode. So LBR_SELECT should be set to + * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) + */ + reg->config = mask ^ x86_pmu.lbr_sel_mask; + + if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && + (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && + (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + reg->config |= LBR_NO_INFO; + + return 0; +} + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + int ret = 0; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * setup SW LBR filter + */ + ret = intel_pmu_setup_sw_lbr_filter(event); + if (ret) + return ret; + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindy read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. 
This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, bytes_read, is64); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return X86_BR_ABORT; + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + insn_get_immediate(&insn); + if (insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_IND_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. 
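One consequence of this filtering is that the branch stack delivered to user space is variable-length: each sample carries a count followed by that many struct perf_branch_entry records. A minimal consumer-side sketch, assuming 'p' already points at the PERF_SAMPLE_BRANCH_STACK portion of a decoded sample record:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>

static void dump_branch_stack(const void *p)
{
        const uint64_t *nr = p;         /* u64 bnr, then bnr entries */
        const struct perf_branch_entry *br = (const void *)(nr + 1);
        uint64_t i;

        for (i = 0; i < *nr; i++)
                printf("%#llx -> %#llx %s\n",
                       (unsigned long long)br[i].from,
                       (unsigned long long)br[i].to,
                       br[i].mispred ? "mispredicted" : "predicted");
}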
+ */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, +}; + +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_RETURN | LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, +}; + +/* core */ +void __init intel_pmu_lbr_init_core(void) +{ + x86_pmu.lbr_nr = 4; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("4-deep LBR, "); +} + +/* 
nehalem/westmere */ +void __init intel_pmu_lbr_init_nhm(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* sandy bridge */ +void __init intel_pmu_lbr_init_snb(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* haswell */ +void intel_pmu_lbr_init_hsw(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ + x86_pmu.lbr_nr = 32; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("32-deep LBR, "); +} + +/* atom */ +void __init intel_pmu_lbr_init_atom(void) +{ + /* + * only models starting at stepping 10 seems + * to have an operational LBR which can freeze + * on PMU interrupt + */ + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} + +/* Knights Landing */ +void intel_pmu_lbr_init_knl(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + pr_cont("8-deep LBR, "); +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 035bdb664d71..d52ed9b918b3 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c deleted file mode 100644 index 653f88d25987..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ /dev/null @@ -1,1062 +0,0 @@ -#include -#include - -#include -#include -#include - -#include "perf_event.h" - -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - 
LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, -}; - -static enum { - LBR_EIP_FLAGS = 1, - LBR_TSX = 2, -} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { - [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, - [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, -}; - -/* - * Intel LBR_SELECT bits - * Intel Vol3a, April 2011, Section 16.7 Table 16-10 - * - * Hardware branch filter (not available on all CPUs) - */ -#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ -#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ -#define LBR_JCC_BIT 2 /* do not capture conditional branches */ -#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ -#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ -#define LBR_RETURN_BIT 5 /* do not capture near returns */ -#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ -#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ -#define LBR_FAR_BIT 8 /* do not capture far branches */ -#define LBR_CALL_STACK_BIT 9 /* enable call stack */ - -/* - * Following bit only exists in Linux; we mask it out before writing it to - * the actual MSR. But it helps the constraint perf code to understand - * that this is a separate configuration. - */ -#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */ - -#define LBR_KERNEL (1 << LBR_KERNEL_BIT) -#define LBR_USER (1 << LBR_USER_BIT) -#define LBR_JCC (1 << LBR_JCC_BIT) -#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) -#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) -#define LBR_RETURN (1 << LBR_RETURN_BIT) -#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) -#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) -#define LBR_FAR (1 << LBR_FAR_BIT) -#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) -#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) - -#define LBR_PLM (LBR_KERNEL | LBR_USER) - -#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ -#define LBR_NOT_SUPP -1 /* LBR filter not supported */ -#define LBR_IGN 0 /* ignored */ - -#define LBR_ANY \ - (LBR_JCC |\ - LBR_REL_CALL |\ - LBR_IND_CALL |\ - LBR_RETURN |\ - LBR_REL_JMP |\ - LBR_IND_JMP |\ - LBR_FAR) - -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) -#define LBR_FROM_FLAG_IN_TX (1ULL << 62) -#define LBR_FROM_FLAG_ABORT (1ULL << 61) - -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - 
X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - -static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); - -/* - * We only support LBR implementations that have FREEZE_LBRS_ON_PMI - * otherwise it becomes near impossible to get a reliable stack. - */ - -static void __intel_pmu_lbr_enable(bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 debugctl, lbr_select = 0, orig_debugctl; - - /* - * No need to unfreeze manually, as v4 can do that as part - * of the GLOBAL_STATUS ack. - */ - if (pmi && x86_pmu.version >= 4) - return; - - /* - * No need to reprogram LBR_SELECT in a PMI, as it - * did not change. - */ - if (cpuc->lbr_sel) - lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, lbr_select); - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; - /* - * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. - * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions - * may cause superfluous increase/decrease of LBR_TOS. - */ - if (!(lbr_select & LBR_CALL_STACK)) - debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; - if (orig_debugctl != debugctl) - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - -static void __intel_pmu_lbr_disable(void) -{ - u64 debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - -static void intel_pmu_lbr_reset_32(void) -{ - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) - wrmsrl(x86_pmu.lbr_from + i, 0); -} - -static void intel_pmu_lbr_reset_64(void) -{ - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) { - wrmsrl(x86_pmu.lbr_from + i, 0); - wrmsrl(x86_pmu.lbr_to + i, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); - } -} - -void intel_pmu_lbr_reset(void) -{ - if (!x86_pmu.lbr_nr) - return; - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); -} - -/* - * TOS = most recently recorded branch - */ -static inline u64 intel_pmu_lbr_tos(void) -{ - u64 tos; - - rdmsrl(x86_pmu.lbr_tos, tos); - return tos; -} - -enum { - LBR_NONE, - LBR_VALID, -}; - -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) -{ - int i; - unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - mask = x86_pmu.lbr_nr - 1; - tos = task_ctx->tos; - for (i = 0; i < tos; i++) { - lbr_idx = (tos - i) & mask; - wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; -} - -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) -{ - int i; - unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } - - mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); - for (i = 0; i < tos; i++) { - lbr_idx = (tos - i) & mask; - rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - 
rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; -} - -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - /* - * If LBR callstack feature is enabled and the stack was saved when - * the task was scheduled out, restore the stack. Otherwise flush - * the LBR stack. - */ - task_ctx = ctx ? ctx->task_ctx_data : NULL; - if (task_ctx) { - if (sched_in) { - __intel_pmu_lbr_restore(task_ctx); - cpuc->lbr_context = ctx; - } else { - __intel_pmu_lbr_save(task_ctx); - } - return; - } - - /* - * When sampling the branck stack in system-wide, it may be - * necessary to flush the stack on context switch. This happens - * when the branch stack does not tag its entries with the pid - * of the current task. Otherwise it becomes impossible to - * associate a branch entry with a task. This ambiguity is more - * likely to appear when the branch stack supports priv level - * filtering and the user sets it to monitor only at the user - * level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch - * stack with branch from multiple tasks. - */ - if (sched_in) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = ctx; - } -} - -static inline bool branch_user_callstack(unsigned br_sel) -{ - return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); -} - -void intel_pmu_lbr_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - if (!x86_pmu.lbr_nr) - return; - - /* - * Reset the LBR stack if we changed task context to - * avoid data leaks. 
- */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = event->ctx; - } - cpuc->br_sel = event->hw.branch_reg.reg; - - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } - - cpuc->lbr_users++; - perf_sched_cb_inc(event->ctx->pmu); -} - -void intel_pmu_lbr_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - if (!x86_pmu.lbr_nr) - return; - - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } - - cpuc->lbr_users--; - WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled && !cpuc->lbr_users) { - __intel_pmu_lbr_disable(); - /* avoid stale pointer */ - cpuc->lbr_context = NULL; - } -} - -void intel_pmu_lbr_enable_all(bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->lbr_users) - __intel_pmu_lbr_enable(pmi); -} - -void intel_pmu_lbr_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->lbr_users) - __intel_pmu_lbr_disable(); -} - -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) -{ - unsigned long mask = x86_pmu.lbr_nr - 1; - u64 tos = intel_pmu_lbr_tos(); - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) { - unsigned long lbr_idx = (tos - i) & mask; - union { - struct { - u32 from; - u32 to; - }; - u64 lbr; - } msr_lastbranch; - - rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].reserved = 0; - } - cpuc->lbr_stack.nr = i; -} - -/* - * Due to lack of segmentation in Linux the effective address (offset) - * is the same as the linear address, allowing us to merge the LIP and EIP - * LBR formats. - */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) -{ - bool need_info = false; - unsigned long mask = x86_pmu.lbr_nr - 1; - int lbr_format = x86_pmu.intel_cap.lbr_format; - u64 tos = intel_pmu_lbr_tos(); - int i; - int out = 0; - int num = x86_pmu.lbr_nr; - - if (cpuc->lbr_sel) { - need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); - if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; - } - - for (i = 0; i < num; i++) { - unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; - int skip = 0; - u16 cycles = 0; - int lbr_flags = lbr_desc[lbr_format]; - - rdmsrl(x86_pmu.lbr_from + lbr_idx, from); - rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - - if (lbr_format == LBR_FORMAT_INFO && need_info) { - u64 info; - - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); - mis = !!(info & LBR_INFO_MISPRED); - pred = !mis; - in_tx = !!(info & LBR_INFO_IN_TX); - abort = !!(info & LBR_INFO_ABORT); - cycles = (info & LBR_INFO_CYCLES); - } - if (lbr_flags & LBR_EIP_FLAGS) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - } - if (lbr_flags & LBR_TSX) { - in_tx = !!(from & LBR_FROM_FLAG_IN_TX); - abort = !!(from & LBR_FROM_FLAG_ABORT); - skip = 3; - } - from = (u64)((((s64)from) << skip) >> skip); - - /* - * Some CPUs report duplicated abort records, - * with the second entry not having an abort bit set. - * Skip them here. 
This loop runs backwards, - * so we need to undo the previous record. - * If the abort just happened outside the window - * the extra entry cannot be removed. - */ - if (abort && x86_pmu.lbr_double_abort && out > 0) - out--; - - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].reserved = 0; - out++; - } - cpuc->lbr_stack.nr = out; -} - -void intel_pmu_lbr_read(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (!cpuc->lbr_users) - return; - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); - - intel_pmu_lbr_filter(cpuc); -} - -/* - * SW filter is used: - * - in case there is no HW filter - * - in case the HW filter has errata or limitations - */ -static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) -{ - u64 br_type = event->attr.branch_sample_type; - int mask = 0; - - if (br_type & PERF_SAMPLE_BRANCH_USER) - mask |= X86_BR_USER; - - if (br_type & PERF_SAMPLE_BRANCH_KERNEL) - mask |= X86_BR_KERNEL; - - /* we ignore BRANCH_HV here */ - - if (br_type & PERF_SAMPLE_BRANCH_ANY) - mask |= X86_BR_ANY; - - if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) - mask |= X86_BR_ANY_CALL; - - if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) - mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; - - if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) - mask |= X86_BR_IND_CALL; - - if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) - mask |= X86_BR_ABORT; - - if (br_type & PERF_SAMPLE_BRANCH_IN_TX) - mask |= X86_BR_IN_TX; - - if (br_type & PERF_SAMPLE_BRANCH_NO_TX) - mask |= X86_BR_NO_TX; - - if (br_type & PERF_SAMPLE_BRANCH_COND) - mask |= X86_BR_JCC; - - if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { - if (!x86_pmu_has_lbr_callstack()) - return -EOPNOTSUPP; - if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) - return -EINVAL; - mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | - X86_BR_CALL_STACK; - } - - if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) - mask |= X86_BR_IND_JMP; - - if (br_type & PERF_SAMPLE_BRANCH_CALL) - mask |= X86_BR_CALL | X86_BR_ZERO_CALL; - /* - * stash actual user request into reg, it may - * be used by fixup code for some CPU - */ - event->hw.branch_reg.reg = mask; - return 0; -} - -/* - * setup the HW LBR filter - * Used only when available, may not be enough to disambiguate - * all branches, may need the help of the SW filter - */ -static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, v; - int i; - - for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { - if (!(br_type & (1ULL << i))) - continue; - - v = x86_pmu.lbr_sel_map[i]; - if (v == LBR_NOT_SUPP) - return -EOPNOTSUPP; - - if (v != LBR_IGN) - mask |= v; - } - - reg = &event->hw.branch_reg; - reg->idx = EXTRA_REG_LBR; - - /* - * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate - * in suppress mode. 
So LBR_SELECT should be set to - * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) - */ - reg->config = mask ^ x86_pmu.lbr_sel_mask; - - if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && - (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && - (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) - reg->config |= LBR_NO_INFO; - - return 0; -} - -int intel_pmu_setup_lbr_filter(struct perf_event *event) -{ - int ret = 0; - - /* - * no LBR on this PMU - */ - if (!x86_pmu.lbr_nr) - return -EOPNOTSUPP; - - /* - * setup SW LBR filter - */ - ret = intel_pmu_setup_sw_lbr_filter(event); - if (ret) - return ret; - - /* - * setup HW LBR filter, if any - */ - if (x86_pmu.lbr_sel_map) - ret = intel_pmu_setup_hw_lbr_filter(event); - - return ret; -} - -/* - * return the type of control flow change at address "from" - * intruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, addr, bytes_read, is64); - insn_get_opcode(&insn); - if (!insn.opcode.got) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 
0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - insn_get_immediate(&insn); - if (insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - insn_get_modrm(&insn); - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -/* - * implement actual branch filter based on user demand. - * Hardware may not exactly satisfy that request, thus - * we need to inspect opcodes. Mismatched branches are - * discarded. Therefore, the number of branches returned - * in PERF_SAMPLE_BRANCH_STACK sample may vary. 
- */ -static void -intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) -{ - u64 from, to; - int br_sel = cpuc->br_sel; - int i, j, type; - bool compress = false; - - /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) - return; - - for (i = 0; i < cpuc->lbr_stack.nr; i++) { - - from = cpuc->lbr_entries[i].from; - to = cpuc->lbr_entries[i].to; - - type = branch_type(from, to, cpuc->lbr_entries[i].abort); - if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { - if (cpuc->lbr_entries[i].in_tx) - type |= X86_BR_IN_TX; - else - type |= X86_BR_NO_TX; - } - - /* if type does not correspond, then discard */ - if (type == X86_BR_NONE || (br_sel & type) != type) { - cpuc->lbr_entries[i].from = 0; - compress = true; - } - } - - if (!compress) - return; - - /* remove all entries with from=0 */ - for (i = 0; i < cpuc->lbr_stack.nr; ) { - if (!cpuc->lbr_entries[i].from) { - j = i; - while (++j < cpuc->lbr_stack.nr) - cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; - cpuc->lbr_stack.nr--; - if (!cpuc->lbr_entries[i].from) - continue; - } - i++; - } -} - -/* - * Map interface branch filters onto LBR filters - */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, - /* - * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches - */ - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = - LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, - /* - * NHM/WSM erratum: must include IND_JMP to capture IND_CALL - */ - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, -}; - -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, -}; - -static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_RETURN | LBR_CALL_STACK, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, -}; - -/* core */ -void __init intel_pmu_lbr_init_core(void) -{ - x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_CORE_FROM; - x86_pmu.lbr_to = MSR_LBR_CORE_TO; - - /* - * SW branch filter usage: - * - compensate for lack of HW filter - */ - pr_cont("4-deep LBR, "); -} - -/* 
nehalem/westmere */ -void __init intel_pmu_lbr_init_nhm(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = nhm_lbr_sel_map; - - /* - * SW branch filter usage: - * - workaround LBR_SEL errata (see above) - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("16-deep LBR, "); -} - -/* sandy bridge */ -void __init intel_pmu_lbr_init_snb(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - /* - * SW branch filter usage: - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("16-deep LBR, "); -} - -/* haswell */ -void intel_pmu_lbr_init_hsw(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - - pr_cont("16-deep LBR, "); -} - -/* skylake */ -__init void intel_pmu_lbr_init_skl(void) -{ - x86_pmu.lbr_nr = 32; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - - /* - * SW branch filter usage: - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("32-deep LBR, "); -} - -/* atom */ -void __init intel_pmu_lbr_init_atom(void) -{ - /* - * only models starting at stepping 10 seems - * to have an operational LBR which can freeze - * on PMU interrupt - */ - if (boot_cpu_data.x86_model == 28 - && boot_cpu_data.x86_mask < 10) { - pr_cont("LBR disabled due to erratum"); - return; - } - - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_CORE_FROM; - x86_pmu.lbr_to = MSR_LBR_CORE_TO; - - /* - * SW branch filter usage: - * - compensate for lack of HW filter - */ - pr_cont("8-deep LBR, "); -} - -/* Knights Landing */ -void intel_pmu_lbr_init_knl(void) -{ - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - pr_cont("8-deep LBR, "); -} -- cgit From fd1c601c25785ef38d698ff0091b5fe253074715 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:13 +0100 Subject: perf/x86: Move perf_event_intel_pt.[ch] ...... 
=> x86/events/intel/pt.[ch] Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 1 + arch/x86/events/intel/pt.c | 1188 +++++++++++++++++++++++++++++ arch/x86/events/intel/pt.h | 116 +++ arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/intel_pt.h | 116 --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 1188 ----------------------------- 6 files changed, 1305 insertions(+), 1305 deletions(-) create mode 100644 arch/x86/events/intel/pt.c create mode 100644 arch/x86/events/intel/pt.h delete mode 100644 arch/x86/kernel/cpu/intel_pt.h delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_pt.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 48097fd4aa71..2a2f55d7b6f2 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c new file mode 100644 index 000000000000..e56cebe07994 --- /dev/null +++ b/arch/x86/events/intel/pt.c @@ -0,0 +1,1188 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" +#include "pt.h" + +static DEFINE_PER_CPU(struct pt, pt_ctx); + +static struct pt_pmu pt_pmu; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +/* + * Capabilities of Intel PT hardware, such as number of address bits or + * supported output schemes, are cached and exported to userspace as "caps" + * attribute group of pt pmu device + * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store + * relevant bits together with intel_pt traces. + * + * These are necessary for both trace decoding (payloads_lip, contains address + * width encoded in IP-related packets), and event configuration (bitmasks with + * permitted values for certain bit fields). 
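The same capability bits can be queried directly from CPUID leaf 0x14 (the leaf cached by pt_pmu_hw_init() below) or read back from the sysfs caps directory mentioned above. A small user-space sketch that checks the multi-entry ToPA bit, matching the pt_caps[] table that follows:

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

static bool pt_topa_multiple_entries(void)
{
        unsigned int a, b, c, d;

        if (__get_cpuid_max(0, NULL) < 0x14)
                return false;

        /* CPUID.(EAX=7,ECX=0):EBX[25] == Intel PT supported */
        __cpuid_count(0x07, 0, a, b, c, d);
        if (!(b & (1u << 25)))
                return false;

        /* CPUID.(EAX=0x14,ECX=0): ECX bit 1 = ToPA tables may hold multiple entries */
        __cpuid_count(0x14, 0, a, b, c, d);
        return c & (1u << 1);
}

int main(void)
{
        printf("multi-entry ToPA: %s\n", pt_topa_multiple_entries() ? "yes" : "no");
        return 0;
}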
+ */ +#define PT_CAP(_n, _l, _r, _m) \ + [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ + .reg = _r, .mask = _m } + +static struct pt_cap_desc { + const char *name; + u32 leaf; + u8 reg; + u32 mask; +} pt_caps[] = { + PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(mtc, 0, CR_EBX, BIT(3)), + PT_CAP(topa_output, 0, CR_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), + PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), + PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), +}; + +static u32 pt_cap_get(enum pt_capabilities cap) +{ + struct pt_cap_desc *cd = &pt_caps[cap]; + u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + unsigned int shift = __ffs(cd->mask); + + return (c & cd->mask) >> shift; +} + +static ssize_t pt_cap_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + enum pt_capabilities cap = (long)ea->var; + + return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); +} + +static struct attribute_group pt_cap_group = { + .name = "caps", +}; + +PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(mtc, "config:9" ); +PMU_FORMAT_ATTR(tsc, "config:10" ); +PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); +PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); +PMU_FORMAT_ATTR(psb_period, "config:24-27" ); + +static struct attribute *pt_formats_attr[] = { + &format_attr_cyc.attr, + &format_attr_mtc.attr, + &format_attr_tsc.attr, + &format_attr_noretcomp.attr, + &format_attr_mtc_period.attr, + &format_attr_cyc_thresh.attr, + &format_attr_psb_period.attr, + NULL, +}; + +static struct attribute_group pt_format_group = { + .name = "format", + .attrs = pt_formats_attr, +}; + +static const struct attribute_group *pt_attr_groups[] = { + &pt_cap_group, + &pt_format_group, + NULL, +}; + +static int __init pt_pmu_hw_init(void) +{ + struct dev_ext_attribute *de_attrs; + struct attribute **attrs; + size_t size; + int ret; + long i; + + attrs = NULL; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + cpuid_count(20, i, + &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); + } + + ret = -ENOMEM; + size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); + attrs = kzalloc(size, GFP_KERNEL); + if (!attrs) + goto fail; + + size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); + de_attrs = kzalloc(size, GFP_KERNEL); + if (!de_attrs) + goto fail; + + for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { + struct dev_ext_attribute *de_attr = de_attrs + i; + + de_attr->attr.attr.name = pt_caps[i].name; + + sysfs_attr_init(&de_attr->attr.attr); + + de_attr->attr.attr.mode = S_IRUGO; + de_attr->attr.show = pt_cap_show; + de_attr->var = (void *)i; + + attrs[i] = &de_attr->attr.attr; + } + + pt_cap_group.attrs = attrs; + + return 0; + +fail: + kfree(attrs); + + return ret; +} + +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ + RTIT_CTL_CYC_THRESH | \ + RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ + RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ + RTIT_CTL_DISRETC | \ + RTIT_CTL_CYC_PSB | \ + RTIT_CTL_MTC) + +static bool pt_event_valid(struct perf_event 
*event) +{ + u64 config = event->attr.config; + u64 allowed, requested; + + if ((config & PT_CONFIG_MASK) != config) + return false; + + if (config & RTIT_CTL_CYC_PSB) { + if (!pt_cap_get(PT_CAP_psb_cyc)) + return false; + + allowed = pt_cap_get(PT_CAP_psb_periods); + requested = (config & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + + allowed = pt_cap_get(PT_CAP_cycle_thresholds); + requested = (config & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + } + + if (config & RTIT_CTL_MTC) { + /* + * In the unlikely case that CPUID lists valid mtc periods, + * but not the mtc capability, drop out here. + * + * Spec says that setting mtc period bits while mtc bit in + * CPUID is 0 will #GP, so better safe than sorry. + */ + if (!pt_cap_get(PT_CAP_mtc)) + return false; + + allowed = pt_cap_get(PT_CAP_mtc_periods); + if (!allowed) + return false; + + requested = (config & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET; + + if (!(allowed & BIT(requested))) + return false; + } + + return true; +} + +/* + * PT configuration helpers + * These all are cpu affine and operate on a local PT + */ + +static void pt_config(struct perf_event *event) +{ + u64 reg; + + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + + if (!event->attr.exclude_kernel) + reg |= RTIT_CTL_OS; + if (!event->attr.exclude_user) + reg |= RTIT_CTL_USR; + + reg |= (event->attr.config & PT_CONFIG_MASK); + + wrmsrl(MSR_IA32_RTIT_CTL, reg); +} + +static void pt_config_start(bool start) +{ + u64 ctl; + + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + if (start) + ctl |= RTIT_CTL_TRACEEN; + else + ctl &= ~RTIT_CTL_TRACEEN; + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + /* + * A wrmsr that disables trace generation serializes other PT + * registers and causes all data packets to be written to memory, + * but a fence is required for the data to become globally visible. + * + * The below WMB, separating data store and aux_head store matches + * the consumer's RMB that separates aux_head load and data load. + */ + if (!start) + wmb(); +} + +static void pt_config_buffer(void *buf, unsigned int topa_idx, + unsigned int output_off) +{ + u64 reg; + + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); + + reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); + + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); +} + +/* + * Keep ToPA table-related metadata on the same page as the actual table, + * taking up a few words from the top + */ + +#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) + +/** + * struct topa - page-sized ToPA table with metadata at the top + * @table: actual ToPA table entries, as understood by PT hardware + * @list: linkage to struct pt_buffer's list of tables + * @phys: physical address of this page + * @offset: offset of the first entry in this table in the buffer + * @size: total size of all entries in this table + * @last: index of the last initialized entry in this table + */ +struct topa { + struct topa_entry table[TENTS_PER_PAGE]; + struct list_head list; + u64 phys; + u64 offset; + size_t size; + int last; +}; + +/* make -1 stand for the last table entry */ +#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) + +/** + * topa_alloc() - allocate page-sized ToPA table + * @cpu: CPU on which to allocate. 
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+	int node = cpu_to_node(cpu);
+	struct topa *topa;
+	struct page *p;
+
+	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+	if (!p)
+		return NULL;
+
+	topa = page_address(p);
+	topa->last = 0;
+	topa->phys = page_to_phys(p);
+
+	/*
+	 * In case of single-entry ToPA, always put the self-referencing END
+	 * link as the 2nd entry in the table
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(topa, 1)->end = 1;
+	}
+
+	return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+	free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
+ * current "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+	struct topa *last = buf->last;
+
+	list_add_tail(&topa->list, &buf->tables);
+
+	if (!buf->first) {
+		buf->first = buf->last = buf->cur = topa;
+		return;
+	}
+
+	topa->offset = last->offset + last->size;
+	buf->last = topa;
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return;
+
+	BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+	TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+	/* single-entry ToPA is a special case */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return !!topa->last;
+
+	return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+	struct topa *topa = buf->last;
+	int order = 0;
+	struct page *p;
+
+	p = virt_to_page(buf->data_pages[buf->nr_pages]);
+	if (PagePrivate(p))
+		order = page_private(p);
+
+	if (topa_table_full(topa)) {
+		topa = topa_alloc(buf->cpu, gfp);
+		if (!topa)
+			return -ENOMEM;
+
+		topa_insert_table(buf, topa);
+	}
+
+	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+	TOPA_ENTRY(topa, -1)->size = order;
+	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, -1)->intr = 1;
+		TOPA_ENTRY(topa, -1)->stop = 1;
+	}
+
+	topa->last++;
+	topa->size += sizes(order);
+
+	buf->nr_pages += 1ul << order;
+
+	return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */ +static void pt_topa_dump(struct pt_buffer *buf) +{ + struct topa *topa; + + list_for_each_entry(topa, &buf->tables, list) { + int i; + + pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, + topa->phys, topa->offset, topa->size); + for (i = 0; i < TENTS_PER_PAGE; i++) { + pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", + &topa->table[i], + (unsigned long)topa->table[i].base << TOPA_SHIFT, + sizes(topa->table[i].size), + topa->table[i].end ? 'E' : ' ', + topa->table[i].intr ? 'I' : ' ', + topa->table[i].stop ? 'S' : ' ', + *(u64 *)&topa->table[i]); + if ((pt_cap_get(PT_CAP_topa_multiple_entries) && + topa->table[i].stop) || + topa->table[i].end) + break; + } + } +} + +/** + * pt_buffer_advance() - advance to the next output region + * @buf: PT buffer. + * + * Advance the current pointers in the buffer to the next ToPA entry. + */ +static void pt_buffer_advance(struct pt_buffer *buf) +{ + buf->output_off = 0; + buf->cur_idx++; + + if (buf->cur_idx == buf->cur->last) { + if (buf->cur == buf->last) + buf->cur = buf->first; + else + buf->cur = list_entry(buf->cur->list.next, struct topa, + list); + buf->cur_idx = 0; + } +} + +/** + * pt_update_head() - calculate current offsets and sizes + * @pt: Per-cpu pt context. + * + * Update buffer's current write pointer position and data size. + */ +static void pt_update_head(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + u64 topa_idx, base, old; + + /* offset of the first region in this table from the beginning of buf */ + base = buf->cur->offset + buf->output_off; + + /* offset of the current output region within this table */ + for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) + base += sizes(buf->cur->table[topa_idx].size); + + if (buf->snapshot) { + local_set(&buf->data_size, base); + } else { + old = (local64_xchg(&buf->head, base) & + ((buf->nr_pages << PAGE_SHIFT) - 1)); + if (base < old) + base += buf->nr_pages << PAGE_SHIFT; + + local_add(base - old, &buf->data_size); + } +} + +/** + * pt_buffer_region() - obtain current output region's address + * @buf: PT buffer. + */ +static void *pt_buffer_region(struct pt_buffer *buf) +{ + return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); +} + +/** + * pt_buffer_region_size() - obtain current output region's size + * @buf: PT buffer. + */ +static size_t pt_buffer_region_size(struct pt_buffer *buf) +{ + return sizes(buf->cur->table[buf->cur_idx].size); +} + +/** + * pt_handle_status() - take care of possible status conditions + * @pt: Per-cpu pt context. + */ +static void pt_handle_status(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + int advance = 0; + u64 status; + + rdmsrl(MSR_IA32_RTIT_STATUS, status); + + if (status & RTIT_STATUS_ERROR) { + pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); + pt_topa_dump(buf); + status &= ~RTIT_STATUS_ERROR; + } + + if (status & RTIT_STATUS_STOPPED) { + status &= ~RTIT_STATUS_STOPPED; + + /* + * On systems that only do single-entry ToPA, hitting STOP + * means we are already losing data; need to let the decoder + * know. + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) || + buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + local_inc(&buf->lost); + advance++; + } + } + + /* + * Also on single-entry ToPA implementations, interrupt will come + * before the output reaches its output region's boundary. 
+ */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && + pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { + void *head = pt_buffer_region(buf); + + /* everything within this margin needs to be zeroed out */ + memset(head + buf->output_off, 0, + pt_buffer_region_size(buf) - + buf->output_off); + advance++; + } + + if (advance) + pt_buffer_advance(buf); + + wrmsrl(MSR_IA32_RTIT_STATUS, status); +} + +/** + * pt_read_offset() - translate registers into buffer pointers + * @buf: PT buffer. + * + * Set buffer's output pointers from MSR values. + */ +static void pt_read_offset(struct pt_buffer *buf) +{ + u64 offset, base_topa; + + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); + buf->cur = phys_to_virt(base_topa); + + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); + /* offset within current output region */ + buf->output_off = offset >> 32; + /* index of current output region within this table */ + buf->cur_idx = (offset & 0xffffff80) >> 7; +} + +/** + * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry + * @buf: PT buffer. + * @pg: Page offset in the buffer. + * + * When advancing to the next output region (ToPA entry), given a page offset + * into the buffer, we need to find the offset of the first page in the next + * region. + */ +static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) +{ + struct topa_entry *te = buf->topa_index[pg]; + + /* one region */ + if (buf->first == buf->last && buf->first->last == 1) + return pg; + + do { + pg++; + pg &= buf->nr_pages - 1; + } while (buf->topa_index[pg] == te); + + return pg; +} + +/** + * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer + * @buf: PT buffer. + * @handle: Current output handle. + * + * Place INT and STOP marks to prevent overwriting old data that the consumer + * hasn't yet collected and waking up the consumer after a certain fraction of + * the buffer has filled up. Only needed and sensible for non-snapshot counters. + * + * This obviously relies on buf::head to figure out buffer markers, so it has + * to be called after pt_buffer_reset_offsets() and before the hardware tracing + * is enabled. 
+ */ +static int pt_buffer_reset_markers(struct pt_buffer *buf, + struct perf_output_handle *handle) + +{ + unsigned long head = local64_read(&buf->head); + unsigned long idx, npages, wakeup; + + /* can't stop in the middle of an output region */ + if (buf->output_off + handle->size + 1 < + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + return -EINVAL; + + + /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return 0; + + /* clear STOP and INT from current entry */ + buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->intr_pos]->intr = 0; + + /* how many pages till the STOP marker */ + npages = handle->size >> PAGE_SHIFT; + + /* if it's on a page boundary, fill up one more page */ + if (!offset_in_page(head + handle->size + 1)) + npages++; + + idx = (head >> PAGE_SHIFT) + npages; + idx &= buf->nr_pages - 1; + buf->stop_pos = idx; + + wakeup = handle->wakeup >> PAGE_SHIFT; + + /* in the worst case, wake up the consumer one page before hard stop */ + idx = (head >> PAGE_SHIFT) + npages - 1; + if (idx > wakeup) + idx = wakeup; + + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; + + buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->intr_pos]->intr = 1; + + return 0; +} + +/** + * pt_buffer_setup_topa_index() - build topa_index[] table of regions + * @buf: PT buffer. + * + * topa_index[] references output regions indexed by offset into the + * buffer for purposes of quick reverse lookup. + */ +static void pt_buffer_setup_topa_index(struct pt_buffer *buf) +{ + struct topa *cur = buf->first, *prev = buf->last; + struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), + *te_prev = TOPA_ENTRY(prev, prev->last - 1); + int pg = 0, idx = 0; + + while (pg < buf->nr_pages) { + int tidx; + + /* pages within one topa entry */ + for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) + buf->topa_index[pg] = te_prev; + + te_prev = te_cur; + + if (idx == cur->last - 1) { + /* advance to next topa table */ + idx = 0; + cur = list_entry(cur->list.next, struct topa, list); + } else { + idx++; + } + te_cur = TOPA_ENTRY(cur, idx); + } + +} + +/** + * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head + * @buf: PT buffer. + * @head: Write pointer (aux_head) from AUX buffer. + * + * Find the ToPA table and entry corresponding to given @head and set buffer's + * "current" pointers accordingly. This is done after we have obtained the + * current aux_head position from a successful call to perf_aux_output_begin() + * to make sure the hardware is writing to the right place. + * + * This function modifies buf::{cur,cur_idx,output_off} that will be programmed + * into PT msrs when the tracing is enabled and buf::head and buf::data_size, + * which are used to determine INT and STOP markers' locations by a subsequent + * call to pt_buffer_reset_markers(). 
+ */ +static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) +{ + int pg; + + if (buf->snapshot) + head &= (buf->nr_pages << PAGE_SHIFT) - 1; + + pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); + pg = pt_topa_next_entry(buf, pg); + + buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); + buf->cur_idx = ((unsigned long)buf->topa_index[pg] - + (unsigned long)buf->cur) / sizeof(struct topa_entry); + buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); + + local64_set(&buf->head, head); + local_set(&buf->data_size, 0); +} + +/** + * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer + * @buf: PT buffer. + */ +static void pt_buffer_fini_topa(struct pt_buffer *buf) +{ + struct topa *topa, *iter; + + list_for_each_entry_safe(topa, iter, &buf->tables, list) { + /* + * right now, this is in free_aux() path only, so + * no need to unlink this table from the list + */ + topa_free(topa); + } +} + +/** + * pt_buffer_init_topa() - initialize ToPA table for pt buffer + * @buf: PT buffer. + * @size: Total size of all regions within this ToPA. + * @gfp: Allocation flags. + */ +static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, + gfp_t gfp) +{ + struct topa *topa; + int err; + + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + + while (buf->nr_pages < nr_pages) { + err = topa_insert_pages(buf, gfp); + if (err) { + pt_buffer_fini_topa(buf); + return -ENOMEM; + } + } + + pt_buffer_setup_topa_index(buf); + + /* link last table to the first one, unless we're double buffering */ + if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; + TOPA_ENTRY(buf->last, -1)->end = 1; + } + + pt_topa_dump(buf); + return 0; +} + +/** + * pt_buffer_setup_aux() - set up topa tables for a PT buffer + * @cpu: Cpu on which to allocate, -1 means current. + * @pages: Array of pointers to buffer pages passed from perf core. + * @nr_pages: Number of pages in the buffer. + * @snapshot: If this is a snapshot/overwrite counter. + * + * This is a pmu::setup_aux callback that sets up ToPA tables and all the + * bookkeeping for an AUX buffer. + * + * Return: Our private PT buffer structure. + */ +static void * +pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) +{ + struct pt_buffer *buf; + int node, ret; + + if (!nr_pages) + return NULL; + + if (cpu == -1) + cpu = raw_smp_processor_id(); + node = cpu_to_node(cpu); + + buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), + GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->cpu = cpu; + buf->snapshot = snapshot; + buf->data_pages = pages; + + INIT_LIST_HEAD(&buf->tables); + + ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); + if (ret) { + kfree(buf); + return NULL; + } + + return buf; +} + +/** + * pt_buffer_free_aux() - perf AUX deallocation path callback + * @data: PT buffer. + */ +static void pt_buffer_free_aux(void *data) +{ + struct pt_buffer *buf = data; + + pt_buffer_fini_topa(buf); + kfree(buf); +} + +/** + * pt_buffer_is_full() - check if the buffer is full + * @buf: PT buffer. + * @pt: Per-cpu pt handle. + * + * If the user hasn't read data from the output region that aux_head + * points to, the buffer is considered full: the user needs to read at + * least this region and update aux_tail to point past it. 
+ */ +static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= pt->handle.size) + return true; + + return false; +} + +/** + * intel_pt_interrupt() - PT PMI handler + */ +void intel_pt_interrupt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + struct perf_event *event = pt->handle.event; + + /* + * There may be a dangling PT bit in the interrupt status register + * after PT has been disabled by pt_event_stop(). Make sure we don't + * do anything (particularly, re-enable) for this event here. + */ + if (!ACCESS_ONCE(pt->handle_nmi)) + return; + + pt_config_start(false); + + if (!event) + return; + + buf = perf_get_aux(&pt->handle); + if (!buf) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + + if (!event->hw.state) { + int ret; + + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + pt_buffer_reset_offsets(buf, pt->handle.head); + /* snapshot counters don't use PMI, so it's safe */ + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) { + perf_aux_output_end(&pt->handle, 0, true); + return; + } + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + pt_config(event); + } +} + +/* + * PMU callbacks + */ + +static void pt_event_start(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (!buf || pt_buffer_is_full(buf, pt)) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + ACCESS_ONCE(pt->handle_nmi) = 1; + event->hw.state = 0; + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + pt_config(event); +} + +static void pt_event_stop(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + /* + * Protect against the PMI racing with disabling wrmsr, + * see comment in intel_pt_interrupt(). 
+ */ + ACCESS_ONCE(pt->handle_nmi) = 0; + pt_config_start(false); + + if (event->hw.state == PERF_HES_STOPPED) + return; + + event->hw.state = PERF_HES_STOPPED; + + if (mode & PERF_EF_UPDATE) { + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (!buf) + return; + + if (WARN_ON_ONCE(pt->handle.event != event)) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + } +} + +static void pt_event_del(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + + pt_event_stop(event, PERF_EF_UPDATE); + + buf = perf_get_aux(&pt->handle); + + if (buf) { + if (buf->snapshot) + pt->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + } +} + +static int pt_event_add(struct perf_event *event, int mode) +{ + struct pt_buffer *buf; + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + if (pt->handle.event) + goto fail; + + buf = perf_aux_output_begin(&pt->handle, event); + ret = -EINVAL; + if (!buf) + goto fail_stop; + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) + goto fail_end_stop; + } + + if (mode & PERF_EF_START) { + pt_event_start(event, 0); + ret = -EBUSY; + if (hwc->state == PERF_HES_STOPPED) + goto fail_end_stop; + } else { + hwc->state = PERF_HES_STOPPED; + } + + return 0; + +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; +fail: + return ret; +} + +static void pt_event_read(struct perf_event *event) +{ +} + +static void pt_event_destroy(struct perf_event *event) +{ + x86_del_exclusive(x86_lbr_exclusive_pt); +} + +static int pt_event_init(struct perf_event *event) +{ + if (event->attr.type != pt_pmu.pmu.type) + return -ENOENT; + + if (!pt_event_valid(event)) + return -EINVAL; + + if (x86_add_exclusive(x86_lbr_exclusive_pt)) + return -EBUSY; + + event->destroy = pt_event_destroy; + + return 0; +} + +void cpu_emergency_stop_pt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + if (pt->handle.event) + pt_event_stop(pt->handle.event, PERF_EF_UPDATE); +} + +static __init int pt_init(void) +{ + int ret, cpu, prior_warn = 0; + + BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + return -ENODEV; + + get_online_cpus(); + for_each_online_cpu(cpu) { + u64 ctl; + + ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); + if (!ret && (ctl & RTIT_CTL_TRACEEN)) + prior_warn++; + } + put_online_cpus(); + + if (prior_warn) { + x86_add_exclusive(x86_lbr_exclusive_pt); + pr_warn("PT is enabled at boot time, doing nothing\n"); + + return -EBUSY; + } + + ret = pt_pmu_hw_init(); + if (ret) + return ret; + + if (!pt_cap_get(PT_CAP_topa_output)) { + pr_warn("ToPA output is not supported on this CPU\n"); + return -ENODEV; + } + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + pt_pmu.pmu.capabilities = + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + + pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = 
pt_buffer_setup_aux;
+	pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+	return ret;
+}
+arch_initcall(pt_init);
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
new file mode 100644
index 000000000000..336878a5d205
--- /dev/null
+++ b/arch/x86/events/intel/pt.h
@@ -0,0 +1,116 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to the region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+#define TOPA_SHIFT 12
+
+static inline unsigned int sizes(unsigned int tsz)
+{
+	return 1 << (tsz + TOPA_SHIFT);
+};
+
+struct topa_entry {
+	u64 end   : 1;
+	u64 rsvd0 : 1;
+	u64 intr  : 1;
+	u64 rsvd1 : 1;
+	u64 stop  : 1;
+	u64 rsvd2 : 1;
+	u64 size  : 4;
+	u64 rsvd3 : 2;
+	u64 base  : 36;
+	u64 rsvd4 : 16;
+};
+
+#define PT_CPUID_LEAVES   2
+#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */
+
+enum pt_capabilities {
+	PT_CAP_max_subleaf = 0,
+	PT_CAP_cr3_filtering,
+	PT_CAP_psb_cyc,
+	PT_CAP_mtc,
+	PT_CAP_topa_output,
+	PT_CAP_topa_multiple_entries,
+	PT_CAP_single_range_output,
+	PT_CAP_payloads_lip,
+	PT_CAP_mtc_periods,
+	PT_CAP_cycle_thresholds,
+	PT_CAP_psb_periods,
+};
+
+struct pt_pmu {
+	struct pmu pmu;
+	u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *		cpu, depending on perf event configuration
+ * @cpu:	cpu for per-cpu allocation
+ * @tables:	list of ToPA tables in this buffer
+ * @first:	shorthand for first topa table
+ * @last:	shorthand for last topa table
+ * @cur:	current topa table
+ * @nr_pages:	buffer size in pages
+ * @cur_idx:	current output region's index within @cur table
+ * @output_off:	offset within the current output region
+ * @data_size:	running total of the amount of data in this buffer
+ * @lost:	if data was lost/truncated
+ * @head:	logical write offset inside the buffer
+ * @snapshot:	if this is for a snapshot/overwrite counter
+ * @stop_pos:	STOP topa entry in the buffer
+ * @intr_pos:	INT topa entry in the buffer
+ * @data_pages:	array of pages from perf
+ * @topa_index:	table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+	int			cpu;
+	struct list_head	tables;
+	struct topa		*first, *last, *cur;
+	unsigned int		cur_idx;
+	size_t			output_off;
+	unsigned long		nr_pages;
+	local_t			data_size;
+	local_t			lost;
+	local64_t		head;
+	bool			snapshot;
+	unsigned long		stop_pos, intr_pos;
+	void			**data_pages;
+	struct topa_entry	*topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:	perf output handle
+ * @handle_nmi:	handle PT PMI on this cpu (there's an active event)
+ */
+struct pt {
+	struct perf_output_handle handle;
+	int handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
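
[A note on the arithmetic used by the driver above: pt_cap_get() in pt.c and sizes() in pt.h reduce to a few lines of bit manipulation. The stand-alone, user-space sketch below is not part of the patch; the CPUID words are invented for illustration, and the registers are indexed in a plain eax/ebx/ecx/edx order rather than through the driver's enum cpuid_regs.]

/*
 * Illustration only -- not part of the kernel patch. Only the bit
 * arithmetic mirrors pt_cap_get() and sizes(); the values are made up.
 * Build with gcc or clang (uses the __builtin_ctz() builtin).
 */
#include <stdio.h>
#include <stdint.h>

#define PT_CPUID_REGS_NUM 4	/* one 32-bit word per register, per subleaf */
#define TOPA_SHIFT        12	/* smallest ToPA output region is 4KB */

/* (word & mask), shifted down by the position of the mask's lowest set bit */
static uint32_t cap_get(const uint32_t *caps, unsigned int leaf,
			unsigned int reg, uint32_t mask)
{
	uint32_t c = caps[leaf * PT_CPUID_REGS_NUM + reg];

	return (c & mask) >> __builtin_ctz(mask);
}

/* a ToPA entry size code 0..15 maps to a region of 4KB << code bytes */
static unsigned int region_bytes(unsigned int tsz)
{
	return 1u << (tsz + TOPA_SHIFT);
}

int main(void)
{
	/* pretend CPUID leaf 0x14, subleaves 0 and 1, returned these words */
	uint32_t caps[2 * PT_CPUID_REGS_NUM] = {
		[2] = 0x3,		/* subleaf 0, word 2: two low bits set */
		[4] = 0x00030000,	/* subleaf 1, word 0: bitmap in high half */
	};

	printf("bit 0 of word 2:  %u\n", cap_get(caps, 0, 2, 1u << 0));	/* 1 */
	printf("bit 1 of word 2:  %u\n", cap_get(caps, 0, 2, 1u << 1));	/* 1 */
	printf("high-half bitmap: %#x\n", cap_get(caps, 1, 0, 0xffff0000));	/* 0x3 */
	printf("size code 3:      %u bytes\n", region_bytes(3));	/* 32768 */

	return 0;
}
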
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d52ed9b918b3..595e3ff6cd1b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h deleted file mode 100644 index 336878a5d205..000000000000 --- a/arch/x86/kernel/cpu/intel_pt.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Intel(R) Processor Trace PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Intel PT is specified in the Intel Architecture Instruction Set Extensions - * Programming Reference: - * http://software.intel.com/en-us/intel-isa-extensions - */ - -#ifndef __INTEL_PT_H__ -#define __INTEL_PT_H__ - -/* - * Single-entry ToPA: when this close to region boundary, switch - * buffers to avoid losing data. - */ -#define TOPA_PMI_MARGIN 512 - -#define TOPA_SHIFT 12 - -static inline unsigned int sizes(unsigned int tsz) -{ - return 1 << (tsz + TOPA_SHIFT); -}; - -struct topa_entry { - u64 end : 1; - u64 rsvd0 : 1; - u64 intr : 1; - u64 rsvd1 : 1; - u64 stop : 1; - u64 rsvd2 : 1; - u64 size : 4; - u64 rsvd3 : 2; - u64 base : 36; - u64 rsvd4 : 16; -}; - -#define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ - -enum pt_capabilities { - PT_CAP_max_subleaf = 0, - PT_CAP_cr3_filtering, - PT_CAP_psb_cyc, - PT_CAP_mtc, - PT_CAP_topa_output, - PT_CAP_topa_multiple_entries, - PT_CAP_single_range_output, - PT_CAP_payloads_lip, - PT_CAP_mtc_periods, - PT_CAP_cycle_thresholds, - PT_CAP_psb_periods, -}; - -struct pt_pmu { - struct pmu pmu; - u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; -}; - -/** - * struct pt_buffer - buffer configuration; one buffer per task_struct or - * cpu, depending on perf event configuration - * @cpu: cpu for per-cpu allocation - * @tables: list of ToPA tables in this buffer - * @first: shorthand for first topa table - * @last: shorthand for last topa table - * @cur: current topa table - * @nr_pages: buffer size in pages - * @cur_idx: current output region's index within @cur table - * @output_off: offset within the current output region - * @data_size: running total of the amount of data in this buffer - * @lost: if data was lost/truncated - * @head: logical write offset inside the buffer - * @snapshot: if this is for a snapshot/overwrite counter - * @stop_pos: STOP topa entry in the buffer - * @intr_pos: INT topa entry in the buffer - * @data_pages: array of pages from perf - * @topa_index: table of topa entries indexed by page offset - */ -struct pt_buffer { - int cpu; - struct list_head tables; - struct topa *first, *last, *cur; - unsigned int cur_idx; - size_t output_off; - unsigned long nr_pages; - local_t 
data_size; - local_t lost; - local64_t head; - bool snapshot; - unsigned long stop_pos, intr_pos; - void **data_pages; - struct topa_entry *topa_index[0]; -}; - -/** - * struct pt - per-cpu pt context - * @handle: perf output handle - * @handle_nmi: do handle PT PMI on this cpu, there's an active event - */ -struct pt { - struct perf_output_handle handle; - int handle_nmi; -}; - -#endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c deleted file mode 100644 index c0bbd1033b7c..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ /dev/null @@ -1,1188 +0,0 @@ -/* - * Intel(R) Processor Trace PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Intel PT is specified in the Intel Architecture Instruction Set Extensions - * Programming Reference: - * http://software.intel.com/en-us/intel-isa-extensions - */ - -#undef DEBUG - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include - -#include -#include -#include -#include - -#include "perf_event.h" -#include "intel_pt.h" - -static DEFINE_PER_CPU(struct pt, pt_ctx); - -static struct pt_pmu pt_pmu; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -/* - * Capabilities of Intel PT hardware, such as number of address bits or - * supported output schemes, are cached and exported to userspace as "caps" - * attribute group of pt pmu device - * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store - * relevant bits together with intel_pt traces. - * - * These are necessary for both trace decoding (payloads_lip, contains address - * width encoded in IP-related packets), and event configuration (bitmasks with - * permitted values for certain bit fields). 
- */ -#define PT_CAP(_n, _l, _r, _m) \ - [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ - .reg = _r, .mask = _m } - -static struct pt_cap_desc { - const char *name; - u32 leaf; - u8 reg; - u32 mask; -} pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), -}; - -static u32 pt_cap_get(enum pt_capabilities cap) -{ - struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; - unsigned int shift = __ffs(cd->mask); - - return (c & cd->mask) >> shift; -} - -static ssize_t pt_cap_show(struct device *cdev, - struct device_attribute *attr, - char *buf) -{ - struct dev_ext_attribute *ea = - container_of(attr, struct dev_ext_attribute, attr); - enum pt_capabilities cap = (long)ea->var; - - return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); -} - -static struct attribute_group pt_cap_group = { - .name = "caps", -}; - -PMU_FORMAT_ATTR(cyc, "config:1" ); -PMU_FORMAT_ATTR(mtc, "config:9" ); -PMU_FORMAT_ATTR(tsc, "config:10" ); -PMU_FORMAT_ATTR(noretcomp, "config:11" ); -PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); -PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); -PMU_FORMAT_ATTR(psb_period, "config:24-27" ); - -static struct attribute *pt_formats_attr[] = { - &format_attr_cyc.attr, - &format_attr_mtc.attr, - &format_attr_tsc.attr, - &format_attr_noretcomp.attr, - &format_attr_mtc_period.attr, - &format_attr_cyc_thresh.attr, - &format_attr_psb_period.attr, - NULL, -}; - -static struct attribute_group pt_format_group = { - .name = "format", - .attrs = pt_formats_attr, -}; - -static const struct attribute_group *pt_attr_groups[] = { - &pt_cap_group, - &pt_format_group, - NULL, -}; - -static int __init pt_pmu_hw_init(void) -{ - struct dev_ext_attribute *de_attrs; - struct attribute **attrs; - size_t size; - int ret; - long i; - - attrs = NULL; - - for (i = 0; i < PT_CPUID_LEAVES; i++) { - cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); - } - - ret = -ENOMEM; - size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); - attrs = kzalloc(size, GFP_KERNEL); - if (!attrs) - goto fail; - - size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); - de_attrs = kzalloc(size, GFP_KERNEL); - if (!de_attrs) - goto fail; - - for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { - struct dev_ext_attribute *de_attr = de_attrs + i; - - de_attr->attr.attr.name = pt_caps[i].name; - - sysfs_attr_init(&de_attr->attr.attr); - - de_attr->attr.attr.mode = S_IRUGO; - de_attr->attr.show = pt_cap_show; - de_attr->var = (void *)i; - - attrs[i] = &de_attr->attr.attr; - } - - pt_cap_group.attrs = attrs; - - return 0; - -fail: - kfree(attrs); - - return ret; -} - -#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ - RTIT_CTL_CYC_THRESH | \ - RTIT_CTL_PSB_FREQ) - -#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ - RTIT_CTL_MTC_RANGE) - -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ - RTIT_CTL_DISRETC | \ - RTIT_CTL_CYC_PSB | \ - RTIT_CTL_MTC) - -static bool pt_event_valid(struct perf_event 
*event) -{ - u64 config = event->attr.config; - u64 allowed, requested; - - if ((config & PT_CONFIG_MASK) != config) - return false; - - if (config & RTIT_CTL_CYC_PSB) { - if (!pt_cap_get(PT_CAP_psb_cyc)) - return false; - - allowed = pt_cap_get(PT_CAP_psb_periods); - requested = (config & RTIT_CTL_PSB_FREQ) >> - RTIT_CTL_PSB_FREQ_OFFSET; - if (requested && (!(allowed & BIT(requested)))) - return false; - - allowed = pt_cap_get(PT_CAP_cycle_thresholds); - requested = (config & RTIT_CTL_CYC_THRESH) >> - RTIT_CTL_CYC_THRESH_OFFSET; - if (requested && (!(allowed & BIT(requested)))) - return false; - } - - if (config & RTIT_CTL_MTC) { - /* - * In the unlikely case that CPUID lists valid mtc periods, - * but not the mtc capability, drop out here. - * - * Spec says that setting mtc period bits while mtc bit in - * CPUID is 0 will #GP, so better safe than sorry. - */ - if (!pt_cap_get(PT_CAP_mtc)) - return false; - - allowed = pt_cap_get(PT_CAP_mtc_periods); - if (!allowed) - return false; - - requested = (config & RTIT_CTL_MTC_RANGE) >> - RTIT_CTL_MTC_RANGE_OFFSET; - - if (!(allowed & BIT(requested))) - return false; - } - - return true; -} - -/* - * PT configuration helpers - * These all are cpu affine and operate on a local PT - */ - -static void pt_config(struct perf_event *event) -{ - u64 reg; - - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; - wrmsrl(MSR_IA32_RTIT_STATUS, 0); - } - - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; - - if (!event->attr.exclude_kernel) - reg |= RTIT_CTL_OS; - if (!event->attr.exclude_user) - reg |= RTIT_CTL_USR; - - reg |= (event->attr.config & PT_CONFIG_MASK); - - wrmsrl(MSR_IA32_RTIT_CTL, reg); -} - -static void pt_config_start(bool start) -{ - u64 ctl; - - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - if (start) - ctl |= RTIT_CTL_TRACEEN; - else - ctl &= ~RTIT_CTL_TRACEEN; - wrmsrl(MSR_IA32_RTIT_CTL, ctl); - - /* - * A wrmsr that disables trace generation serializes other PT - * registers and causes all data packets to be written to memory, - * but a fence is required for the data to become globally visible. - * - * The below WMB, separating data store and aux_head store matches - * the consumer's RMB that separates aux_head load and data load. - */ - if (!start) - wmb(); -} - -static void pt_config_buffer(void *buf, unsigned int topa_idx, - unsigned int output_off) -{ - u64 reg; - - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); - - reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); - - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); -} - -/* - * Keep ToPA table-related metadata on the same page as the actual table, - * taking up a few words from the top - */ - -#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) - -/** - * struct topa - page-sized ToPA table with metadata at the top - * @table: actual ToPA table entries, as understood by PT hardware - * @list: linkage to struct pt_buffer's list of tables - * @phys: physical address of this page - * @offset: offset of the first entry in this table in the buffer - * @size: total size of all entries in this table - * @last: index of the last initialized entry in this table - */ -struct topa { - struct topa_entry table[TENTS_PER_PAGE]; - struct list_head list; - u64 phys; - u64 offset; - size_t size; - int last; -}; - -/* make -1 stand for the last table entry */ -#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) - -/** - * topa_alloc() - allocate page-sized ToPA table - * @cpu: CPU on which to allocate. 
- * @gfp: Allocation flags. - * - * Return: On success, return the pointer to ToPA table page. - */ -static struct topa *topa_alloc(int cpu, gfp_t gfp) -{ - int node = cpu_to_node(cpu); - struct topa *topa; - struct page *p; - - p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); - if (!p) - return NULL; - - topa = page_address(p); - topa->last = 0; - topa->phys = page_to_phys(p); - - /* - * In case of singe-entry ToPA, always put the self-referencing END - * link as the 2nd entry in the table - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(topa, 1)->end = 1; - } - - return topa; -} - -/** - * topa_free() - free a page-sized ToPA table - * @topa: Table to deallocate. - */ -static void topa_free(struct topa *topa) -{ - free_page((unsigned long)topa); -} - -/** - * topa_insert_table() - insert a ToPA table into a buffer - * @buf: PT buffer that's being extended. - * @topa: New topa table to be inserted. - * - * If it's the first table in this buffer, set up buffer's pointers - * accordingly; otherwise, add a END=1 link entry to @topa to the current - * "last" table and adjust the last table pointer to @topa. - */ -static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) -{ - struct topa *last = buf->last; - - list_add_tail(&topa->list, &buf->tables); - - if (!buf->first) { - buf->first = buf->last = buf->cur = topa; - return; - } - - topa->offset = last->offset + last->size; - buf->last = topa; - - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return; - - BUG_ON(last->last != TENTS_PER_PAGE - 1); - - TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(last, -1)->end = 1; -} - -/** - * topa_table_full() - check if a ToPA table is filled up - * @topa: ToPA table. - */ -static bool topa_table_full(struct topa *topa) -{ - /* single-entry ToPA is a special case */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return !!topa->last; - - return topa->last == TENTS_PER_PAGE - 1; -} - -/** - * topa_insert_pages() - create a list of ToPA tables - * @buf: PT buffer being initialized. - * @gfp: Allocation flags. - * - * This initializes a list of ToPA tables with entries from - * the data_pages provided by rb_alloc_aux(). - * - * Return: 0 on success or error code. - */ -static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) -{ - struct topa *topa = buf->last; - int order = 0; - struct page *p; - - p = virt_to_page(buf->data_pages[buf->nr_pages]); - if (PagePrivate(p)) - order = page_private(p); - - if (topa_table_full(topa)) { - topa = topa_alloc(buf->cpu, gfp); - if (!topa) - return -ENOMEM; - - topa_insert_table(buf, topa); - } - - TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; - TOPA_ENTRY(topa, -1)->size = order; - if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, -1)->intr = 1; - TOPA_ENTRY(topa, -1)->stop = 1; - } - - topa->last++; - topa->size += sizes(order); - - buf->nr_pages += 1ul << order; - - return 0; -} - -/** - * pt_topa_dump() - print ToPA tables and their entries - * @buf: PT buffer. 
- */ -static void pt_topa_dump(struct pt_buffer *buf) -{ - struct topa *topa; - - list_for_each_entry(topa, &buf->tables, list) { - int i; - - pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, - topa->phys, topa->offset, topa->size); - for (i = 0; i < TENTS_PER_PAGE; i++) { - pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", - &topa->table[i], - (unsigned long)topa->table[i].base << TOPA_SHIFT, - sizes(topa->table[i].size), - topa->table[i].end ? 'E' : ' ', - topa->table[i].intr ? 'I' : ' ', - topa->table[i].stop ? 'S' : ' ', - *(u64 *)&topa->table[i]); - if ((pt_cap_get(PT_CAP_topa_multiple_entries) && - topa->table[i].stop) || - topa->table[i].end) - break; - } - } -} - -/** - * pt_buffer_advance() - advance to the next output region - * @buf: PT buffer. - * - * Advance the current pointers in the buffer to the next ToPA entry. - */ -static void pt_buffer_advance(struct pt_buffer *buf) -{ - buf->output_off = 0; - buf->cur_idx++; - - if (buf->cur_idx == buf->cur->last) { - if (buf->cur == buf->last) - buf->cur = buf->first; - else - buf->cur = list_entry(buf->cur->list.next, struct topa, - list); - buf->cur_idx = 0; - } -} - -/** - * pt_update_head() - calculate current offsets and sizes - * @pt: Per-cpu pt context. - * - * Update buffer's current write pointer position and data size. - */ -static void pt_update_head(struct pt *pt) -{ - struct pt_buffer *buf = perf_get_aux(&pt->handle); - u64 topa_idx, base, old; - - /* offset of the first region in this table from the beginning of buf */ - base = buf->cur->offset + buf->output_off; - - /* offset of the current output region within this table */ - for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) - base += sizes(buf->cur->table[topa_idx].size); - - if (buf->snapshot) { - local_set(&buf->data_size, base); - } else { - old = (local64_xchg(&buf->head, base) & - ((buf->nr_pages << PAGE_SHIFT) - 1)); - if (base < old) - base += buf->nr_pages << PAGE_SHIFT; - - local_add(base - old, &buf->data_size); - } -} - -/** - * pt_buffer_region() - obtain current output region's address - * @buf: PT buffer. - */ -static void *pt_buffer_region(struct pt_buffer *buf) -{ - return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); -} - -/** - * pt_buffer_region_size() - obtain current output region's size - * @buf: PT buffer. - */ -static size_t pt_buffer_region_size(struct pt_buffer *buf) -{ - return sizes(buf->cur->table[buf->cur_idx].size); -} - -/** - * pt_handle_status() - take care of possible status conditions - * @pt: Per-cpu pt context. - */ -static void pt_handle_status(struct pt *pt) -{ - struct pt_buffer *buf = perf_get_aux(&pt->handle); - int advance = 0; - u64 status; - - rdmsrl(MSR_IA32_RTIT_STATUS, status); - - if (status & RTIT_STATUS_ERROR) { - pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); - pt_topa_dump(buf); - status &= ~RTIT_STATUS_ERROR; - } - - if (status & RTIT_STATUS_STOPPED) { - status &= ~RTIT_STATUS_STOPPED; - - /* - * On systems that only do single-entry ToPA, hitting STOP - * means we are already losing data; need to let the decoder - * know. - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) || - buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { - local_inc(&buf->lost); - advance++; - } - } - - /* - * Also on single-entry ToPA implementations, interrupt will come - * before the output reaches its output region's boundary. 
- */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && - pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { - void *head = pt_buffer_region(buf); - - /* everything within this margin needs to be zeroed out */ - memset(head + buf->output_off, 0, - pt_buffer_region_size(buf) - - buf->output_off); - advance++; - } - - if (advance) - pt_buffer_advance(buf); - - wrmsrl(MSR_IA32_RTIT_STATUS, status); -} - -/** - * pt_read_offset() - translate registers into buffer pointers - * @buf: PT buffer. - * - * Set buffer's output pointers from MSR values. - */ -static void pt_read_offset(struct pt_buffer *buf) -{ - u64 offset, base_topa; - - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - buf->cur = phys_to_virt(base_topa); - - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); - /* offset within current output region */ - buf->output_off = offset >> 32; - /* index of current output region within this table */ - buf->cur_idx = (offset & 0xffffff80) >> 7; -} - -/** - * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry - * @buf: PT buffer. - * @pg: Page offset in the buffer. - * - * When advancing to the next output region (ToPA entry), given a page offset - * into the buffer, we need to find the offset of the first page in the next - * region. - */ -static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) -{ - struct topa_entry *te = buf->topa_index[pg]; - - /* one region */ - if (buf->first == buf->last && buf->first->last == 1) - return pg; - - do { - pg++; - pg &= buf->nr_pages - 1; - } while (buf->topa_index[pg] == te); - - return pg; -} - -/** - * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer - * @buf: PT buffer. - * @handle: Current output handle. - * - * Place INT and STOP marks to prevent overwriting old data that the consumer - * hasn't yet collected and waking up the consumer after a certain fraction of - * the buffer has filled up. Only needed and sensible for non-snapshot counters. - * - * This obviously relies on buf::head to figure out buffer markers, so it has - * to be called after pt_buffer_reset_offsets() and before the hardware tracing - * is enabled. 
- */ -static int pt_buffer_reset_markers(struct pt_buffer *buf, - struct perf_output_handle *handle) - -{ - unsigned long head = local64_read(&buf->head); - unsigned long idx, npages, wakeup; - - /* can't stop in the middle of an output region */ - if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) - return -EINVAL; - - - /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return 0; - - /* clear STOP and INT from current entry */ - buf->topa_index[buf->stop_pos]->stop = 0; - buf->topa_index[buf->intr_pos]->intr = 0; - - /* how many pages till the STOP marker */ - npages = handle->size >> PAGE_SHIFT; - - /* if it's on a page boundary, fill up one more page */ - if (!offset_in_page(head + handle->size + 1)) - npages++; - - idx = (head >> PAGE_SHIFT) + npages; - idx &= buf->nr_pages - 1; - buf->stop_pos = idx; - - wakeup = handle->wakeup >> PAGE_SHIFT; - - /* in the worst case, wake up the consumer one page before hard stop */ - idx = (head >> PAGE_SHIFT) + npages - 1; - if (idx > wakeup) - idx = wakeup; - - idx &= buf->nr_pages - 1; - buf->intr_pos = idx; - - buf->topa_index[buf->stop_pos]->stop = 1; - buf->topa_index[buf->intr_pos]->intr = 1; - - return 0; -} - -/** - * pt_buffer_setup_topa_index() - build topa_index[] table of regions - * @buf: PT buffer. - * - * topa_index[] references output regions indexed by offset into the - * buffer for purposes of quick reverse lookup. - */ -static void pt_buffer_setup_topa_index(struct pt_buffer *buf) -{ - struct topa *cur = buf->first, *prev = buf->last; - struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), - *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0; - - while (pg < buf->nr_pages) { - int tidx; - - /* pages within one topa entry */ - for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) - buf->topa_index[pg] = te_prev; - - te_prev = te_cur; - - if (idx == cur->last - 1) { - /* advance to next topa table */ - idx = 0; - cur = list_entry(cur->list.next, struct topa, list); - } else { - idx++; - } - te_cur = TOPA_ENTRY(cur, idx); - } - -} - -/** - * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head - * @buf: PT buffer. - * @head: Write pointer (aux_head) from AUX buffer. - * - * Find the ToPA table and entry corresponding to given @head and set buffer's - * "current" pointers accordingly. This is done after we have obtained the - * current aux_head position from a successful call to perf_aux_output_begin() - * to make sure the hardware is writing to the right place. - * - * This function modifies buf::{cur,cur_idx,output_off} that will be programmed - * into PT msrs when the tracing is enabled and buf::head and buf::data_size, - * which are used to determine INT and STOP markers' locations by a subsequent - * call to pt_buffer_reset_markers(). 
- */ -static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) -{ - int pg; - - if (buf->snapshot) - head &= (buf->nr_pages << PAGE_SHIFT) - 1; - - pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - pg = pt_topa_next_entry(buf, pg); - - buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); - buf->cur_idx = ((unsigned long)buf->topa_index[pg] - - (unsigned long)buf->cur) / sizeof(struct topa_entry); - buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); - - local64_set(&buf->head, head); - local_set(&buf->data_size, 0); -} - -/** - * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer - * @buf: PT buffer. - */ -static void pt_buffer_fini_topa(struct pt_buffer *buf) -{ - struct topa *topa, *iter; - - list_for_each_entry_safe(topa, iter, &buf->tables, list) { - /* - * right now, this is in free_aux() path only, so - * no need to unlink this table from the list - */ - topa_free(topa); - } -} - -/** - * pt_buffer_init_topa() - initialize ToPA table for pt buffer - * @buf: PT buffer. - * @size: Total size of all regions within this ToPA. - * @gfp: Allocation flags. - */ -static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, - gfp_t gfp) -{ - struct topa *topa; - int err; - - topa = topa_alloc(buf->cpu, gfp); - if (!topa) - return -ENOMEM; - - topa_insert_table(buf, topa); - - while (buf->nr_pages < nr_pages) { - err = topa_insert_pages(buf, gfp); - if (err) { - pt_buffer_fini_topa(buf); - return -ENOMEM; - } - } - - pt_buffer_setup_topa_index(buf); - - /* link last table to the first one, unless we're double buffering */ - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; - TOPA_ENTRY(buf->last, -1)->end = 1; - } - - pt_topa_dump(buf); - return 0; -} - -/** - * pt_buffer_setup_aux() - set up topa tables for a PT buffer - * @cpu: Cpu on which to allocate, -1 means current. - * @pages: Array of pointers to buffer pages passed from perf core. - * @nr_pages: Number of pages in the buffer. - * @snapshot: If this is a snapshot/overwrite counter. - * - * This is a pmu::setup_aux callback that sets up ToPA tables and all the - * bookkeeping for an AUX buffer. - * - * Return: Our private PT buffer structure. - */ -static void * -pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) -{ - struct pt_buffer *buf; - int node, ret; - - if (!nr_pages) - return NULL; - - if (cpu == -1) - cpu = raw_smp_processor_id(); - node = cpu_to_node(cpu); - - buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), - GFP_KERNEL, node); - if (!buf) - return NULL; - - buf->cpu = cpu; - buf->snapshot = snapshot; - buf->data_pages = pages; - - INIT_LIST_HEAD(&buf->tables); - - ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); - if (ret) { - kfree(buf); - return NULL; - } - - return buf; -} - -/** - * pt_buffer_free_aux() - perf AUX deallocation path callback - * @data: PT buffer. - */ -static void pt_buffer_free_aux(void *data) -{ - struct pt_buffer *buf = data; - - pt_buffer_fini_topa(buf); - kfree(buf); -} - -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. 
- */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= pt->handle.size) - return true; - - return false; -} - -/** - * intel_pt_interrupt() - PT PMI handler - */ -void intel_pt_interrupt(void) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - struct perf_event *event = pt->handle.event; - - /* - * There may be a dangling PT bit in the interrupt status register - * after PT has been disabled by pt_event_stop(). Make sure we don't - * do anything (particularly, re-enable) for this event here. - */ - if (!ACCESS_ONCE(pt->handle_nmi)) - return; - - pt_config_start(false); - - if (!event) - return; - - buf = perf_get_aux(&pt->handle); - if (!buf) - return; - - pt_read_offset(buf); - - pt_handle_status(pt); - - pt_update_head(pt); - - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); - - if (!event->hw.state) { - int ret; - - buf = perf_aux_output_begin(&pt->handle, event); - if (!buf) { - event->hw.state = PERF_HES_STOPPED; - return; - } - - pt_buffer_reset_offsets(buf, pt->handle.head); - /* snapshot counters don't use PMI, so it's safe */ - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) { - perf_aux_output_end(&pt->handle, 0, true); - return; - } - - pt_config_buffer(buf->cur->table, buf->cur_idx, - buf->output_off); - pt_config(event); - } -} - -/* - * PMU callbacks - */ - -static void pt_event_start(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); - - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; - } - - ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; - - pt_config_buffer(buf->cur->table, buf->cur_idx, - buf->output_off); - pt_config(event); -} - -static void pt_event_stop(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - - /* - * Protect against the PMI racing with disabling wrmsr, - * see comment in intel_pt_interrupt(). 
- */ - ACCESS_ONCE(pt->handle_nmi) = 0; - pt_config_start(false); - - if (event->hw.state == PERF_HES_STOPPED) - return; - - event->hw.state = PERF_HES_STOPPED; - - if (mode & PERF_EF_UPDATE) { - struct pt_buffer *buf = perf_get_aux(&pt->handle); - - if (!buf) - return; - - if (WARN_ON_ONCE(pt->handle.event != event)) - return; - - pt_read_offset(buf); - - pt_handle_status(pt); - - pt_update_head(pt); - } -} - -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { - if (buf->snapshot) - pt->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); - } -} - -static int pt_event_add(struct perf_event *event, int mode) -{ - struct pt_buffer *buf; - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; - - if (pt->handle.event) - goto fail; - - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - - if (mode & PERF_EF_START) { - pt_event_start(event, 0); - ret = -EBUSY; - if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; - } else { - hwc->state = PERF_HES_STOPPED; - } - - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; -fail: - return ret; -} - -static void pt_event_read(struct perf_event *event) -{ -} - -static void pt_event_destroy(struct perf_event *event) -{ - x86_del_exclusive(x86_lbr_exclusive_pt); -} - -static int pt_event_init(struct perf_event *event) -{ - if (event->attr.type != pt_pmu.pmu.type) - return -ENOENT; - - if (!pt_event_valid(event)) - return -EINVAL; - - if (x86_add_exclusive(x86_lbr_exclusive_pt)) - return -EBUSY; - - event->destroy = pt_event_destroy; - - return 0; -} - -void cpu_emergency_stop_pt(void) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - - if (pt->handle.event) - pt_event_stop(pt->handle.event, PERF_EF_UPDATE); -} - -static __init int pt_init(void) -{ - int ret, cpu, prior_warn = 0; - - BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - return -ENODEV; - - get_online_cpus(); - for_each_online_cpu(cpu) { - u64 ctl; - - ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); - if (!ret && (ctl & RTIT_CTL_TRACEEN)) - prior_warn++; - } - put_online_cpus(); - - if (prior_warn) { - x86_add_exclusive(x86_lbr_exclusive_pt); - pr_warn("PT is enabled at boot time, doing nothing\n"); - - return -EBUSY; - } - - ret = pt_pmu_hw_init(); - if (ret) - return ret; - - if (!pt_cap_get(PT_CAP_topa_output)) { - pr_warn("ToPA output is not supported on this CPU\n"); - return -ENODEV; - } - - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - pt_pmu.pmu.capabilities = - PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; - - pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; - pt_pmu.pmu.attr_groups = pt_attr_groups; - pt_pmu.pmu.task_ctx_nr = perf_sw_context; - pt_pmu.pmu.event_init = pt_event_init; - pt_pmu.pmu.add = pt_event_add; - pt_pmu.pmu.del = pt_event_del; - pt_pmu.pmu.start = pt_event_start; - pt_pmu.pmu.stop = pt_event_stop; - pt_pmu.pmu.read = pt_event_read; - pt_pmu.pmu.setup_aux = 
pt_buffer_setup_aux; - pt_pmu.pmu.free_aux = pt_buffer_free_aux; - ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); - - return ret; -} -arch_initcall(pt_init); -- cgit From 609d809f832ddda20f03029c865dd052596ea394 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:14 +0100 Subject: perf/x86: Move perf_event_intel_rapl.c ....... => x86/events/intel/rapl.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/rapl.c | 783 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/perf_event_intel_rapl.c | 783 ---------------------------- 4 files changed, 784 insertions(+), 785 deletions(-) create mode 100644 arch/x86/events/intel/rapl.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_rapl.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 2a2f55d7b6f2..f68232c4de81 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c new file mode 100644 index 000000000000..9541f50a534a --- /dev/null +++ b/arch/x86/events/intel/rapl.c @@ -0,0 +1,783 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. 
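[Editorial aside, not part of the patch.] The header comment just above describes the RAPL events as free-running, counting-only, system-wide counters reported in 32.32 fixed point. A minimal user-space sketch of what a consumer of this interface could look like is given below: it opens the energy-pkg event (perf code 0x2, per the table above) on the "power" PMU that this driver registers, reads the accumulated count, and scales it by 2^-32 to get Joules. The sysfs path is the standard perf event_source location; running it typically requires root.

/*
 * Hypothetical user-space sketch: count package energy via the RAPL PMU.
 * Assumes the PMU registered below as "power" and the energy-pkg encoding
 * event=0x02 documented above; typically needs root (cpu-wide counting).
 */
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	unsigned long long count;
	int type, fd;
	FILE *f;

	/* PMU type id published by perf_pmu_register() under sysfs. */
	f = fopen("/sys/bus/event_source/devices/power/type", "r");
	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = type;
	attr.config = 0x02;		/* energy-pkg, see RAPL_EVENT_ATTR_STR below */

	/* System-wide counting on CPU 0; RAPL rejects sampling and task mode. */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) != sizeof(count))
		return 1;

	/* Raw counts are 2^-32 Joules regardless of the hardware unit. */
	printf("~%.6f Joules over 1 second\n", ldexp((double)count, -32));
	close(fd);
	return 0;
}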
Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ + +#define NR_RAPL_DOMAINS 0x4 +static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", + "pp1-gpu", +}; + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +struct rapl_pmu { + spinlock_t lock; + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; +}; + +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static struct x86_pmu_quirk *rapl_quirks; +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +#define rapl_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = rapl_quirks; \ + rapl_quirks = &__quirk; \ +} while (0) + +static inline u64 rapl_scale(u64 v, int cfg) +{ + if (cfg > NR_RAPL_DOMAINS) { + pr_warn("invalid domain %d, failed to scale data\n", cfg); + return v; + } + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - rapl_hw_unit[cfg - 1]); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. 
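[Editorial aside, not part of the patch.] The shift arithmetic that follows this comment is what turns a possibly wrapped, non-sign-extended raw MSR value into a signed delta. The standalone sketch below isolates that trick: shift both samples up so the counter's top bit lands in bit 63, subtract, then arithmetic-shift back down, so a wrap between two reads still yields the correct small delta.

/*
 * Illustrative sketch of the shift-based delta used in rapl_event_update():
 * treat only the low `width` bits as significant and let the arithmetic
 * right shift sign-extend across a counter wrap.
 */
#include <assert.h>
#include <stdint.h>

static int64_t counter_delta(uint64_t prev, uint64_t now, int width)
{
	int shift = 64 - width;
	int64_t delta;

	delta = (int64_t)(now << shift) - (int64_t)(prev << shift);
	return delta >> shift;
}

int main(void)
{
	/* 32-bit counter wrapped from near the top back to a small value. */
	assert(counter_delta(0xFFFFFFF0ULL, 0x00000010ULL, 32) == 0x20);
	/* Ordinary, non-wrapping case. */
	assert(counter_delta(100, 150, 32) == 50);
	return 0;
}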
+ */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta, event->hw.config); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & 
~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + case INTEL_RAPL_PP1: + bit = RAPL_IDX_PP1_NRG_STAT; + msr = MSR_PP1_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute 
*rapl_events_knl_attr[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static __init void rapl_hsw_server_quirk(void) +{ + /* + * DRAM domain on HSW server has fixed energy unit which can be + * different than the unit from power unit MSR. + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + u64 ms; + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + pmu->pmu = &rapl_pmu_class; + + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. 
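[Editorial aside, not part of the patch.] The rest of this comment, continued below, derives the polling interval from a 200 W reference load. The worked numbers in the following snippet confirm the formula: with the SandyBridge unit of 2^-16 J per count, the 32-bit energy status counter wraps after 2^32 * 2^-16 = 65536 J, which is about 327 s at 200 W, and the driver arms the hrtimer at roughly half of that.

/*
 * Worked example of the timeout formula used in rapl_cpu_prepare():
 *   ms = (1000 / (2 * 100)) * 2^(32 - hw_unit - 1)
 * For hw_unit = 16 (1/2^16 J per count) this gives 5 * 32768 = 163840 ms,
 * about half of the ~327 s it takes a 200 W load to wrap the 32-bit counter.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t rapl_timeout_ms(int hw_unit)
{
	if (hw_unit >= 32)
		return 2;
	return (1000 / (2 * 100)) * (1ULL << (32 - hw_unit - 1));
}

int main(void)
{
	printf("hw_unit=16 -> %llu ms\n",
	       (unsigned long long)rapl_timeout_ms(16));	/* 163840 */
	printf("hw_unit=14 -> %llu ms\n",
	       (unsigned long long)rapl_timeout_ms(14));	/* 655360 */
	return 0;
}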
+ * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (rapl_hw_unit[0] < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static int rapl_check_hw_unit(void) +{ + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + return 0; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + struct x86_pmu_quirk *quirk; + int i; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 63: /* Haswell-Server */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + case 61: /* Broadwell */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + case 87: /* Knights Landing */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_KNL; + rapl_pmu_events_group.attrs = rapl_events_knl_attr; + + default: + /* unsupported */ + return 0; + } + ret = rapl_check_hw_unit(); + if (ret) + return ret; + + /* run cpu model quirks */ + for (quirk = rapl_quirks; quirk; quirk = quirk->next) + quirk->func(); + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; + rapl_cpu_init(cpu); + } + + __perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + cpu_notifier_register_done(); + return -1; + } + + pmu = 
__this_cpu_read(rapl_pmu); + + pr_info("RAPL PMU detected," + " API unit is 2^-32 Joules," + " %d fixed counters" + " %llu ms ovfl timer\n", + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } +out: + cpu_notifier_register_done(); + + return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 595e3ff6cd1b..018451a4b8f5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c deleted file mode 100644 index 24a351ad628d..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ /dev/null @@ -1,783 +0,0 @@ -/* - * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters - * Copyright (C) 2013 Google, Inc., Stephane Eranian - * - * Intel RAPL interface is specified in the IA-32 Manual Vol3b - * section 14.7.1 (September 2013) - * - * RAPL provides more controls than just reporting energy consumption - * however here we only expose the 3 energy consumption free running - * counters (pp0, pkg, dram). - * - * Each of those counters increments in a power unit defined by the - * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules - * but it can vary. - * - * Counter to rapl events mappings: - * - * pp0 counter: consumption of all physical cores (power plane 0) - * event: rapl_energy_cores - * perf code: 0x1 - * - * pkg counter: consumption of the whole processor package - * event: rapl_energy_pkg - * perf code: 0x2 - * - * dram counter: consumption of the dram domain (servers only) - * event: rapl_energy_dram - * perf code: 0x3 - * - * dram counter: consumption of the builtin-gpu domain (client only) - * event: rapl_energy_gpu - * perf code: 0x4 - * - * We manage those counters as free running (read-only). They may be - * use simultaneously by other tools, such as turbostat. - * - * The events only support system-wide mode counting. There is no - * sampling support because it does not make sense and is not - * supported by the RAPL hardware. - * - * Because we want to avoid floating-point operations in the kernel, - * the events are all reported in fixed point arithmetic (32.32). - * Tools must adjust the counts to convert them to Watts using - * the duration of the measurement. 
Tools may use a function such as - * ldexp(raw_count, -32); - */ -#include -#include -#include -#include -#include "perf_event.h" - -/* - * RAPL energy status counters - */ -#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ -#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ -#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ -#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ -#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ -#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ -#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ - -#define NR_RAPL_DOMAINS 0x4 -static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { - "pp0-core", - "package", - "dram", - "pp1-gpu", -}; - -/* Clients have PP0, PKG */ -#define RAPL_IDX_CLN (1<config - * any other bit is reserved - */ -#define RAPL_EVENT_MASK 0xFFULL - -#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __rapl_##_var##_show, NULL) - -#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ - -#define RAPL_EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -struct rapl_pmu { - spinlock_t lock; - int n_active; /* number of active events */ - struct list_head active_list; - struct pmu *pmu; /* pointer to rapl_pmu_class */ - ktime_t timer_interval; /* in ktime_t unit */ - struct hrtimer hrtimer; -}; - -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ -static struct pmu rapl_pmu_class; -static cpumask_t rapl_cpu_mask; -static int rapl_cntr_mask; - -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); - -static struct x86_pmu_quirk *rapl_quirks; -static inline u64 rapl_read_counter(struct perf_event *event) -{ - u64 raw; - rdmsrl(event->hw.event_base, raw); - return raw; -} - -#define rapl_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = rapl_quirks; \ - rapl_quirks = &__quirk; \ -} while (0) - -static inline u64 rapl_scale(u64 v, int cfg) -{ - if (cfg > NR_RAPL_DOMAINS) { - pr_warn("invalid domain %d, failed to scale data\n", cfg); - return v; - } - /* - * scale delta to smallest unit (1/2^32) - * users must then scale back: count * 1/(1e9*2^32) to get Joules - * or use ldexp(count, -32). - * Watts = Joules/Time delta - */ - return v << (32 - rapl_hw_unit[cfg - 1]); -} - -static u64 rapl_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - s64 delta, sdelta; - int shift = RAPL_CNTR_WIDTH; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(event->hw.event_base, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) { - cpu_relax(); - goto again; - } - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. 
- */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - sdelta = rapl_scale(delta, event->hw.config); - - local64_add(sdelta, &event->count); - - return new_raw_count; -} - -static void rapl_start_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_start(&pmu->hrtimer, pmu->timer_interval, - HRTIMER_MODE_REL_PINNED); -} - -static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_cancel(&pmu->hrtimer); -} - -static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct perf_event *event; - unsigned long flags; - - if (!pmu->n_active) - return HRTIMER_NORESTART; - - spin_lock_irqsave(&pmu->lock, flags); - - list_for_each_entry(event, &pmu->active_list, active_entry) { - rapl_event_update(event); - } - - spin_unlock_irqrestore(&pmu->lock, flags); - - hrtimer_forward_now(hrtimer, pmu->timer_interval); - - return HRTIMER_RESTART; -} - -static void rapl_hrtimer_init(struct rapl_pmu *pmu) -{ - struct hrtimer *hr = &pmu->hrtimer; - - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = rapl_hrtimer_handle; -} - -static void __rapl_pmu_event_start(struct rapl_pmu *pmu, - struct perf_event *event) -{ - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - - list_add_tail(&event->active_entry, &pmu->active_list); - - local64_set(&event->hw.prev_count, rapl_read_counter(event)); - - pmu->n_active++; - if (pmu->n_active == 1) - rapl_start_hrtimer(pmu); -} - -static void rapl_pmu_event_start(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); -} - -static void rapl_pmu_event_stop(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - - /* mark event as deactivated and stopped */ - if (!(hwc->state & PERF_HES_STOPPED)) { - WARN_ON_ONCE(pmu->n_active <= 0); - pmu->n_active--; - if (pmu->n_active == 0) - rapl_stop_hrtimer(pmu); - - list_del(&event->active_entry); - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - /* check if update of sw counter is necessary */ - if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - rapl_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } - - spin_unlock_irqrestore(&pmu->lock, flags); -} - -static int rapl_pmu_event_add(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (mode & PERF_EF_START) - __rapl_pmu_event_start(pmu, event); - - spin_unlock_irqrestore(&pmu->lock, flags); - - return 0; -} - -static void rapl_pmu_event_del(struct perf_event *event, int flags) -{ - rapl_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int rapl_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, msr, ret = 0; - - /* only look at RAPL events */ - if (event->attr.type != rapl_pmu_class.type) - return -ENOENT; - - /* check only supported bits are set */ - if (event->attr.config & 
~RAPL_EVENT_MASK) - return -EINVAL; - - /* - * check event is known (determines counter) - */ - switch (cfg) { - case INTEL_RAPL_PP0: - bit = RAPL_IDX_PP0_NRG_STAT; - msr = MSR_PP0_ENERGY_STATUS; - break; - case INTEL_RAPL_PKG: - bit = RAPL_IDX_PKG_NRG_STAT; - msr = MSR_PKG_ENERGY_STATUS; - break; - case INTEL_RAPL_RAM: - bit = RAPL_IDX_RAM_NRG_STAT; - msr = MSR_DRAM_ENERGY_STATUS; - break; - case INTEL_RAPL_PP1: - bit = RAPL_IDX_PP1_NRG_STAT; - msr = MSR_PP1_ENERGY_STATUS; - break; - default: - return -EINVAL; - } - /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* must be done before validate_group */ - event->hw.event_base = msr; - event->hw.config = cfg; - event->hw.idx = bit; - - return ret; -} - -static void rapl_pmu_event_read(struct perf_event *event) -{ - rapl_event_update(event); -} - -static ssize_t rapl_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); - -static struct attribute *rapl_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_attr_group = { - .attrs = rapl_pmu_attrs, -}; - -RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); -RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); -RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); - -RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); - -/* - * we compute in 0.23 nJ increments regardless of MSR - */ -RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); - -static struct attribute *rapl_events_srv_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute *rapl_events_cln_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - NULL, -}; - -static struct attribute *rapl_events_hsw_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute 
*rapl_events_knl_attr[] = { - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute_group rapl_pmu_events_group = { - .name = "events", - .attrs = NULL, /* patched at runtime */ -}; - -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); -static struct attribute *rapl_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_format_group = { - .name = "format", - .attrs = rapl_formats_attr, -}; - -const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, - &rapl_pmu_format_group, - &rapl_pmu_events_group, - NULL, -}; - -static struct pmu rapl_pmu_class = { - .attr_groups = rapl_attr_groups, - .task_ctx_nr = perf_invalid_context, /* system-wide only */ - .event_init = rapl_pmu_event_init, - .add = rapl_pmu_event_add, /* must have */ - .del = rapl_pmu_event_del, /* must have */ - .start = rapl_pmu_event_start, - .stop = rapl_pmu_event_stop, - .read = rapl_pmu_event_read, -}; - -static void rapl_cpu_exit(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int i, phys_id = topology_physical_package_id(cpu); - int target = -1; - - /* find a new cpu on same package */ - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - /* - * clear cpu from cpumask - * if was set in cpumask and still some cpu on package, - * then move to new cpu - */ - if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &rapl_cpu_mask); - - WARN_ON(cpumask_empty(&rapl_cpu_mask)); - /* - * migrate events and context to new cpu - */ - if (target >= 0) - perf_pmu_migrate_context(pmu->pmu, cpu, target); - - /* cancel overflow polling timer for CPU */ - rapl_stop_hrtimer(pmu); -} - -static void rapl_cpu_init(int cpu) -{ - int i, phys_id = topology_physical_package_id(cpu); - - /* check if phys_is is already covered */ - for_each_cpu(i, &rapl_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - /* was not found, so add it */ - cpumask_set_cpu(cpu, &rapl_cpu_mask); -} - -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - -static int rapl_cpu_prepare(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int phys_id = topology_physical_package_id(cpu); - u64 ms; - - if (pmu) - return 0; - - if (phys_id < 0) - return -1; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -1; - spin_lock_init(&pmu->lock); - - INIT_LIST_HEAD(&pmu->active_list); - - pmu->pmu = &rapl_pmu_class; - - /* - * use reference of 200W for scaling the timeout - * to avoid missing counter overflows. 
- * 200W = 200 Joules/sec - * divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - if (rapl_hw_unit[0] < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); - else - ms = 2; - - pmu->timer_interval = ms_to_ktime(ms); - - rapl_hrtimer_init(pmu); - - /* set RAPL pmu for this cpu for now */ - per_cpu(rapl_pmu, cpu) = pmu; - per_cpu(rapl_pmu_to_free, cpu) = NULL; - - return 0; -} - -static void rapl_cpu_kfree(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); - - kfree(pmu); - - per_cpu(rapl_pmu_to_free, cpu) = NULL; -} - -static int rapl_cpu_dying(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - - if (!pmu) - return 0; - - per_cpu(rapl_pmu, cpu) = NULL; - - per_cpu(rapl_pmu_to_free, cpu) = pmu; - - return 0; -} - -static int rapl_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - rapl_cpu_prepare(cpu); - break; - case CPU_STARTING: - rapl_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - rapl_cpu_dying(cpu); - break; - case CPU_ONLINE: - case CPU_DEAD: - rapl_cpu_kfree(cpu); - break; - case CPU_DOWN_PREPARE: - rapl_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static int rapl_check_hw_unit(void) -{ - u64 msr_rapl_power_unit_bits; - int i; - - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) - return -1; - for (i = 0; i < NR_RAPL_DOMAINS; i++) - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - - return 0; -} - -static const struct x86_cpu_id rapl_cpu_match[] = { - [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, - [1] = {}, -}; - -static int __init rapl_pmu_init(void) -{ - struct rapl_pmu *pmu; - int cpu, ret; - struct x86_pmu_quirk *quirk; - int i; - - /* - * check for Intel processor family 6 - */ - if (!x86_match_cpu(rapl_cpu_match)) - return 0; - - /* check supported CPU */ - switch (boot_cpu_data.x86_model) { - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - rapl_cntr_mask = RAPL_IDX_CLN; - rapl_pmu_events_group.attrs = rapl_events_cln_attr; - break; - case 63: /* Haswell-Server */ - rapl_add_quirk(rapl_hsw_server_quirk); - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ - case 61: /* Broadwell */ - rapl_cntr_mask = RAPL_IDX_HSW; - rapl_pmu_events_group.attrs = rapl_events_hsw_attr; - break; - case 45: /* Sandy Bridge-EP */ - case 62: /* IvyTown */ - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 87: /* Knights Landing */ - rapl_add_quirk(rapl_hsw_server_quirk); - rapl_cntr_mask = RAPL_IDX_KNL; - rapl_pmu_events_group.attrs = rapl_events_knl_attr; - - default: - /* unsupported */ - return 0; - } - ret = rapl_check_hw_unit(); - if (ret) - return ret; - - /* run cpu model quirks */ - for (quirk = rapl_quirks; quirk; quirk = quirk->next) - quirk->func(); - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - ret = rapl_cpu_prepare(cpu); - if (ret) - goto out; - rapl_cpu_init(cpu); - } - - __perf_cpu_notifier(rapl_cpu_notifier); - - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); - if (WARN_ON(ret)) { - pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - cpu_notifier_register_done(); - return -1; - } - - pmu = 
__this_cpu_read(rapl_pmu); - - pr_info("RAPL PMU detected," - " API unit is 2^-32 Joules," - " %d fixed counters" - " %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), - ktime_to_ms(pmu->timer_interval)); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } -out: - cpu_notifier_register_done(); - - return 0; -} -device_initcall(rapl_pmu_init); -- cgit From 6bcb2db547be8263a98ae9413127df9385b38763 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:15 +0100 Subject: perf/x86: Move perf_event_intel_uncore.[ch] .. => x86/events/intel/uncore.[ch] Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 1 + arch/x86/events/intel/uncore.c | 1401 ++++++++++++++++++++ arch/x86/events/intel/uncore.h | 357 +++++ arch/x86/kernel/cpu/Makefile | 3 +- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 1401 -------------------- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 357 ----- .../x86/kernel/cpu/perf_event_intel_uncore_nhmex.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 2 +- .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 3 +- 9 files changed, 1763 insertions(+), 1764 deletions(-) create mode 100644 arch/x86/events/intel/uncore.c create mode 100644 arch/x86/events/intel/uncore.h delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.h (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f68232c4de81..aae3e530e2ee 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -8,3 +8,4 @@ endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c new file mode 100644 index 000000000000..91a18d6c4405 --- /dev/null +++ b/arch/x86/events/intel/uncore.c @@ -0,0 +1,1401 @@ +#include "uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +struct intel_uncore_type **uncore_msr_uncores = empty_uncore; +struct intel_uncore_type **uncore_pci_uncores = empty_uncore; + +static bool pcidrv_registered; +struct pci_driver *uncore_pci_driver; +/* pci bus to socket mapping */ +DEFINE_RAW_SPINLOCK(pci2phy_map_lock); +struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); +struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint uncore_constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +struct event_constraint uncore_constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +int uncore_pcibus_to_physid(struct pci_bus *bus) +{ + struct pci2phy_map *map; + int phys_id = -1; + + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == pci_domain_nr(bus)) { + phys_id = 
map->pbus_to_physid[bus->number]; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + return phys_id; +} + +struct pci2phy_map *__find_pci2phy_map(int segment) +{ + struct pci2phy_map *map, *alloc = NULL; + int i; + + lockdep_assert_held(&pci2phy_map_lock); + +lookup: + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == segment) + goto end; + } + + if (!alloc) { + raw_spin_unlock(&pci2phy_map_lock); + alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); + raw_spin_lock(&pci2phy_map_lock); + + if (!alloc) + return NULL; + + goto lookup; + } + + map = alloc; + alloc = NULL; + map->segment = segment; + for (i = 0; i < 256; i++) + map->pbus_to_physid[i] = -1; + list_add_tail(&map->list, &pci2phy_map_head); + +end: + kfree(alloc); + return map; +} + +ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + /* Recheck in lock to handle races. */ + if (*per_cpu_ptr(pmu->box, cpu)) + goto out; + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } +out: + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + +u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. + */ +struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. 
+ */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &uncore_constraint_empty; +} + +void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of event which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. + */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. 
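[Editorial aside, not part of the patch.] Because there is no usable overflow interrupt, the handler defined just below periodically refreshes every active event, so the shifted-delta update in uncore_perf_event_update() never has to disambiguate more than one wrap. The small sketch that follows shows the wrap-time arithmetic that bounds how long the polling period may safely be; the 48-bit width and 3 GHz rate are assumptions for illustration, not statements about any specific uncore box.

/*
 * Illustration: time until an N-bit free-running counter wraps at a given
 * increment rate. Polling noticeably faster than this keeps the shifted
 * delta in uncore_perf_event_update() unambiguous.
 */
#include <stdio.h>
#include <stdint.h>

static double seconds_to_wrap(int width, double increments_per_sec)
{
	return (double)((uint64_t)1 << width) / increments_per_sec;
}

int main(void)
{
	printf("48-bit @ 3e9/s: %.0f s\n", seconds_to_wrap(48, 3e9)); /* ~93825 s */
	printf("32-bit @ 3e9/s: %.1f s\n", seconds_to_wrap(32, 3e9)); /* ~1.4 s  */
	return 0;
}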
+ */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + struct perf_event *event; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + /* + * handle boxes with an active event list as opposed to active + * counters + */ + list_for_each_entry(event, &box->active_list, active_entry) { + uncore_perf_event_update(box, event); + } + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); + return HRTIMER_RESTART; +} + +void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); +} + +void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + + box = kzalloc_node(size, GFP_KERNEL, node); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + /* set default hrtimer timeout */ + box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; + + INIT_LIST_HEAD(&box->active_list); + + return box; +} + +/* + * Using uncore_pmu_event_init pmu event_init callback + * as a detection point for uncore events. 
+ */ +static int uncore_pmu_event_init(struct perf_event *event); + +static bool is_uncore_event(struct perf_event *event) +{ + return event->pmu->event_init == uncore_pmu_event_init; +} + +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + + if (is_uncore_event(leader)) { + box->event_list[n] = leader; + n++; + } + + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_uncore_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->attr.config == UNCORE_FIXED_EVENT) + return &uncore_constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_get_event_constraint(box, box->event_list[i]); + box->event_constraint[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = box->event_constraint[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(box->event_constraint, n, + wmin, wmax, n, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? 
-EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
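[Editorial aside, not part of the patch.] uncore_validate_group(), defined just below, performs that check by simulating counter assignment on a throwaway fake box, so an over-committed group is rejected at perf_event_open() time rather than silently failing to schedule. The hedged user-space sketch below shows the caller's view: it opens a leader plus one sibling on the same PMU and reads them together with PERF_FORMAT_GROUP; the PMU name and the two config values are taken from the command line because the real encodings are machine-specific.

/*
 * Hypothetical sketch: open a two-event group on one PMU and read it with
 * PERF_FORMAT_GROUP. If the group cannot be co-scheduled, group validation
 * fails the sibling's perf_event_open() with EINVAL.
 * usage: ./grp <pmu-name> <config0> <config1>   (typically needs root)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int open_counter(int type, unsigned long long config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size        = sizeof(attr);
	attr.type        = type;
	attr.config      = config;
	attr.read_format = PERF_FORMAT_GROUP;

	/* system-wide counting on CPU 0, as in the RAPL example earlier */
	return perf_event_open(&attr, -1, 0, group_fd, 0);
}

int main(int argc, char **argv)
{
	unsigned long long buf[3];	/* { nr, value[0], value[1] } */
	char path[256];
	int type, leader, sibling;
	FILE *f;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <pmu> <cfg0> <cfg1>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/type", argv[1]);
	f = fopen(path, "r");
	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	leader  = open_counter(type, strtoull(argv[2], NULL, 0), -1);
	sibling = open_counter(type, strtoull(argv[3], NULL, 0), leader);
	if (leader < 0 || sibling < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);
	if (read(leader, buf, sizeof(buf)) != sizeof(buf))
		return 1;
	printf("nr=%llu first=%llu second=%llu\n", buf[0], buf[1], buf[2]);
	return 0;
}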
+ */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +static int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + +static int uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + if (!pmu->type->pmu) { + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + } else { + pmu->pmu = *pmu->type->pmu; + pmu->pmu.attr_groups = pmu->type->attr_groups; + } 
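The dry run in uncore_validate_group() above goes through uncore_assign_events(), which hands counters to the most constrained events (smallest constraint weight) first. As an illustration only, here is a stand-alone greedy sketch of that weight-ordered idea, with made-up event masks; the kernel's perf_assign_events() is more elaborate than this toy.

/*
 * Toy model of weight-ordered counter assignment (illustration only).
 * Build with: cc -o toy_assign toy_assign.c
 */
#include <stdio.h>

#define TOY_NUM_COUNTERS 4

struct toy_event {
	const char *name;
	unsigned int idxmsk;	/* counters this event is allowed to use */
};

static int weight(unsigned int mask)
{
	int n = 0;

	for (; mask; mask &= mask - 1)
		n++;
	return n;
}

/* Greedy, weight-ordered assignment; returns 0 if every event got a counter. */
static int toy_assign(struct toy_event *evs, int n, int assign[])
{
	unsigned int used = 0;
	int w, i, idx;

	for (i = 0; i < n; i++)
		assign[i] = -1;

	for (w = 1; w <= TOY_NUM_COUNTERS; w++) {
		for (i = 0; i < n; i++) {
			unsigned int avail;

			if (weight(evs[i].idxmsk) != w)
				continue;
			avail = evs[i].idxmsk & ~used;
			if (!avail)
				return -1;	/* group would not schedule */
			for (idx = 0; !(avail & (1u << idx)); idx++)
				;
			assign[i] = idx;
			used |= 1u << idx;
		}
	}
	return 0;
}

int main(void)
{
	struct toy_event evs[] = {
		{ "fixed-like", 0x1 },	/* may only use counter 0 */
		{ "generic",    0xf },	/* may use any of the 4 counters */
		{ "paired",     0x3 },	/* restricted to counters 0-1 */
	};
	int i, assign[3];

	if (toy_assign(evs, 3, assign))
		return 1;
	for (i = 0; i < 3; i++)
		printf("%s -> counter %d\n", evs[i].name, assign[i]);
	return 0;
}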
+ + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->events_group); + type->events_group = NULL; +} + +static void __init uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *attr_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->pmus = pmus; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + goto fail; + + attrs = (struct attribute **)(attr_group + 1); + attr_group->name = "events"; + attr_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->events_group = attr_group; + } + + type->pmu_group = &uncore_pmu_attr_group; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct intel_uncore_type *type; + int phys_id; + bool first_box = false; + + phys_id = uncore_pcibus_to_physid(pdev->bus); + if (phys_id < 0) + return -ENODEV; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + uncore_extra_pci_dev[phys_id][idx] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + /* Knights Landing uses a common PCI device ID for multiple instances of + * an uncore PMU device type. There is only one entry per device type in + * the knl_uncore_pci_ids table inspite of multiple devices present for + * some device types. Hence PCI device idx would be 0 for all devices. + * So increment pmu pointer to point to an unused array element. 
+ */ + if (boot_cpu_data.x86_model == 87) + while (pmu->func_id >= 0) + pmu++; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + if (list_empty(&pmu->box_list)) + first_box = true; + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + if (first_box) + uncore_pmu_register(pmu); + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id; + bool last_box = false; + + phys_id = uncore_pcibus_to_physid(pdev->bus); + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (uncore_extra_pci_dev[phys_id][i] == pdev) { + uncore_extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + pci_set_drvdata(pdev, NULL); + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + if (list_empty(&pmu->box_list)) + last_box = true; + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); + + if (last_box) + perf_pmu_unregister(&pmu->pmu); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + ret = snbep_uncore_pci_init(); + break; + case 62: /* Ivy Bridge-EP */ + ret = ivbep_uncore_pci_init(); + break; + case 63: /* Haswell-EP */ + ret = hswep_uncore_pci_init(); + break; + case 79: /* BDX-EP */ + case 86: /* BDX-DE */ + ret = bdx_uncore_pci_init(); + break; + case 42: /* Sandy Bridge */ + ret = snb_uncore_pci_init(); + break; + case 58: /* Ivy Bridge */ + ret = ivb_uncore_pci_init(); + break; + case 60: /* Haswell */ + case 69: /* Haswell Celeron */ + ret = hsw_uncore_pci_init(); + break; + case 61: /* Broadwell */ + ret = bdw_uncore_pci_init(); + break; + case 87: /* Knights Landing */ + ret = knl_uncore_pci_init(); + break; + case 94: /* SkyLake */ + ret = skl_uncore_pci_init(); + break; + default: + return 0; + } + + if (ret) + return ret; + + ret = uncore_types_init(uncore_pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(uncore_pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(uncore_pci_uncores); + } +} + +/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ +static LIST_HEAD(boxes_to_free); + +static void uncore_kfree_boxes(void) +{ + struct intel_uncore_box *box; + + while (!list_empty(&boxes_to_free)) { + box = list_entry(boxes_to_free.next, + struct intel_uncore_box, list); + list_del(&box->list); + kfree(box); + } +} + +static void uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncore_msr_uncores[i]; i++) { + 
type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + list_add(&box->list, &boxes_to_free); + } + } +} + +static int uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + if (box) { + list_add(&box->list, + &boxes_to_free); + box = NULL; + } + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(uncore_msr_uncores, cpu, target); + uncore_change_context(uncore_pci_uncores, cpu, target); +} + +static void uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(uncore_msr_uncores, -1, cpu); + uncore_change_context(uncore_pci_uncores, -1, cpu); +} + +static int 
uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + uncore_kfree_boxes(); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + nhm_uncore_cpu_init(); + break; + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell */ + case 70: /* Haswell */ + case 61: /* Broadwell */ + case 71: /* Broadwell */ + snb_uncore_cpu_init(); + break; + case 45: /* Sandy Bridge-EP */ + snbep_uncore_cpu_init(); + break; + case 46: /* Nehalem-EX */ + case 47: /* Westmere-EX aka. Xeon E7 */ + nhmex_uncore_cpu_init(); + break; + case 62: /* Ivy Bridge-EP */ + ivbep_uncore_cpu_init(); + break; + case 63: /* Haswell-EP */ + hswep_uncore_cpu_init(); + break; + case 79: /* BDX-EP */ + case 86: /* BDX-DE */ + bdx_uncore_cpu_init(); + break; + case 87: /* Knights Landing */ + knl_uncore_cpu_init(); + break; + default: + return 0; + } + + ret = uncore_types_init(uncore_msr_uncores); + if (ret) + return ret; + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static void __init uncore_cpumask_init(void) +{ + int cpu; + + /* + * ony invoke once from msr or pci init code + */ + if (!cpumask_empty(&uncore_cpu_mask)) + return; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + __register_cpu_notifier(&uncore_cpu_nb); + + cpu_notifier_register_done(); +} + + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (cpu_has_hypervisor) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + uncore_cpumask_init(); + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git 
a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h new file mode 100644 index 000000000000..1dea2046990a --- /dev/null +++ b/arch/x86/events/intel/uncore.h @@ -0,0 +1,357 @@ +#include +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 3 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[4]; + struct pmu *pmu; /* for custom pmu ops */ +}; + +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config, config1, config2; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + u64 hrtimer_duration; /* hrtimer timeout for this box */ + struct hrtimer hrtimer; + struct list_head list; + struct list_head active_list; + void *io_addr; + struct intel_uncore_extra_reg shared_regs[0]; 
+}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +struct pci2phy_map { + struct list_head list; + int segment; + int pbus_to_physid[256]; +}; + +int uncore_pcibus_to_physid(struct pci_bus *bus); +struct pci2phy_map *__find_pci2phy_map(int segment); + +ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 
2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} + +struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); +struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); +struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); +u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); +void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_event_read(struct perf_event *event); +void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); +void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); +u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); + +extern struct intel_uncore_type **uncore_msr_uncores; +extern struct intel_uncore_type **uncore_pci_uncores; +extern struct pci_driver *uncore_pci_driver; +extern raw_spinlock_t pci2phy_map_lock; +extern struct list_head pci2phy_map_head; +extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +extern struct event_constraint uncore_constraint_empty; + +/* perf_event_intel_uncore_snb.c */ +int 
snb_uncore_pci_init(void); +int ivb_uncore_pci_init(void); +int hsw_uncore_pci_init(void); +int bdw_uncore_pci_init(void); +int skl_uncore_pci_init(void); +void snb_uncore_cpu_init(void); +void nhm_uncore_cpu_init(void); +int snb_pci2phy_map_init(int devid); + +/* perf_event_intel_uncore_snbep.c */ +int snbep_uncore_pci_init(void); +void snbep_uncore_cpu_init(void); +int ivbep_uncore_pci_init(void); +void ivbep_uncore_cpu_init(void); +int hswep_uncore_pci_init(void); +void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void); +int knl_uncore_pci_init(void); +void knl_uncore_cpu_init(void); + +/* perf_event_intel_uncore_nhmex.c */ +void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 018451a4b8f5..875d1b1f5e32 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,8 +33,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ - perf_event_intel_uncore_snb.o \ +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snb.o \ perf_event_intel_uncore_snbep.o \ perf_event_intel_uncore_nhmex.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c deleted file mode 100644 index 3bf41d413775..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ /dev/null @@ -1,1401 +0,0 @@ -#include "perf_event_intel_uncore.h" - -static struct intel_uncore_type *empty_uncore[] = { NULL, }; -struct intel_uncore_type **uncore_msr_uncores = empty_uncore; -struct intel_uncore_type **uncore_pci_uncores = empty_uncore; - -static bool pcidrv_registered; -struct pci_driver *uncore_pci_driver; -/* pci bus to socket mapping */ -DEFINE_RAW_SPINLOCK(pci2phy_map_lock); -struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); -struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; - -static DEFINE_RAW_SPINLOCK(uncore_box_lock); -/* mask of cpus that collect uncore events */ -static cpumask_t uncore_cpu_mask; - -/* constraint for the fixed counter */ -static struct event_constraint uncore_constraint_fixed = - EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); -struct event_constraint uncore_constraint_empty = - EVENT_CONSTRAINT(0, 0, 0); - -int uncore_pcibus_to_physid(struct pci_bus *bus) -{ - struct pci2phy_map *map; - int phys_id = -1; - - raw_spin_lock(&pci2phy_map_lock); - list_for_each_entry(map, &pci2phy_map_head, list) { - if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; - break; - } - } - raw_spin_unlock(&pci2phy_map_lock); - - return phys_id; -} - -struct pci2phy_map *__find_pci2phy_map(int segment) -{ - struct pci2phy_map *map, *alloc = NULL; - int i; - - lockdep_assert_held(&pci2phy_map_lock); - -lookup: - list_for_each_entry(map, &pci2phy_map_head, list) { - if (map->segment == segment) - goto end; - } - - if (!alloc) { - raw_spin_unlock(&pci2phy_map_lock); - alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); - raw_spin_lock(&pci2phy_map_lock); - - if (!alloc) - return NULL; - - goto lookup; - } - - map = alloc; - alloc = NULL; - map->segment = segment; - for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; - list_add_tail(&map->list, &pci2phy_map_head); - -end: - kfree(alloc); - return map; -} - -ssize_t 
uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct uncore_event_desc *event = - container_of(attr, struct uncore_event_desc, attr); - return sprintf(buf, "%s", event->config); -} - -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - -struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - /* Recheck in lock to handle races. */ - if (*per_cpu_ptr(pmu->box, cpu)) - goto out; - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } -out: - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); -} - -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); -} - -u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - u64 count; - - rdmsrl(event->hw.event_base, count); - - return count; -} - -/* - * generic get constraint function for shared match/mask registers. - */ -struct event_constraint * -uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - unsigned long flags; - bool ok = false; - - /* - * reg->alloc can be set due to existing state, so for fake box we - * need to ignore this, otherwise we might fail to allocate proper - * fake state for this extra reg constraint. - */ - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; - - er = &box->shared_regs[reg1->idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || - (er->config1 == reg1->config && er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (ok) { - if (!uncore_box_is_fake(box)) - reg1->alloc = 1; - return NULL; - } - - return &uncore_constraint_empty; -} - -void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - - /* - * Only put constraint if extra reg was actually allocated. Also - * takes care of event which do not use an extra shared reg. - * - * Also, if this is a fake box we shouldn't touch any event state - * (reg->alloc) and we don't care about leaving inconsistent box - * state either since it will be thrown out. 
- */ - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - er = &box->shared_regs[reg1->idx]; - atomic_dec(&er->ref); - reg1->alloc = 0; -} - -u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - er = &box->shared_regs[idx]; - - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - - return config; -} - -static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) -{ - struct hw_perf_event *hwc = &event->hw; - - hwc->idx = idx; - hwc->last_tag = ++box->tags[idx]; - - if (hwc->idx == UNCORE_PMC_IDX_FIXED) { - hwc->event_base = uncore_fixed_ctr(box); - hwc->config_base = uncore_fixed_ctl(box); - return; - } - - hwc->config_base = uncore_event_ctl(box, hwc->idx); - hwc->event_base = uncore_perf_ctr(box, hwc->idx); -} - -void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) -{ - u64 prev_count, new_count, delta; - int shift; - - if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) - shift = 64 - uncore_fixed_ctr_bits(box); - else - shift = 64 - uncore_perf_ctr_bits(box); - - /* the hrtimer might modify the previous event value */ -again: - prev_count = local64_read(&event->hw.prev_count); - new_count = uncore_read_counter(box, event); - if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) - goto again; - - delta = (new_count << shift) - (prev_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); -} - -/* - * The overflow interrupt is unavailable for SandyBridge-EP, is broken - * for SandyBridge. So we use hrtimer to periodically poll the counter - * to avoid overflow. - */ -static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) -{ - struct intel_uncore_box *box; - struct perf_event *event; - unsigned long flags; - int bit; - - box = container_of(hrtimer, struct intel_uncore_box, hrtimer); - if (!box->n_active || box->cpu != smp_processor_id()) - return HRTIMER_NORESTART; - /* - * disable local interrupt to prevent uncore_pmu_event_start/stop - * to interrupt the update process - */ - local_irq_save(flags); - - /* - * handle boxes with an active event list as opposed to active - * counters - */ - list_for_each_entry(event, &box->active_list, active_entry) { - uncore_perf_event_update(box, event); - } - - for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) - uncore_perf_event_update(box, box->events[bit]); - - local_irq_restore(flags); - - hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); - return HRTIMER_RESTART; -} - -void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), - HRTIMER_MODE_REL_PINNED); -} - -void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_cancel(&box->hrtimer); -} - -static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - box->hrtimer.function = uncore_pmu_hrtimer; -} - -static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) -{ - struct intel_uncore_box *box; - int i, size; - - size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - - box = kzalloc_node(size, GFP_KERNEL, node); - if (!box) - return NULL; - - for (i = 0; i < type->num_shared_regs; i++) - raw_spin_lock_init(&box->shared_regs[i].lock); - - uncore_pmu_init_hrtimer(box); - 
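uncore_perf_event_update() above widens the narrow free-running counters by shifting both readings up to bit 63 before subtracting, so a counter wrap still produces the small positive delta. A stand-alone sketch of that shift trick, assuming a 48-bit counter width purely for illustration:

/* Illustration of the shift trick for deltas of a narrow counter. */
#include <stdio.h>
#include <stdint.h>

#define CTR_BITS 48	/* assumed counter width for this example */

static uint64_t counter_delta(uint64_t prev, uint64_t now)
{
	int shift = 64 - CTR_BITS;
	uint64_t delta;

	/* Shift both values up so bit (CTR_BITS - 1) becomes bit 63 ... */
	delta = (now << shift) - (prev << shift);
	/* ... then shift back down; a wrapped counter still yields the
	 * small positive difference.
	 */
	return delta >> shift;
}

int main(void)
{
	uint64_t max = (1ULL << CTR_BITS) - 1;

	/* prints 150 */
	printf("%llu\n", (unsigned long long)counter_delta(100, 250));
	/* prints 16: counter wrapped from near the top back to 10 */
	printf("%llu\n", (unsigned long long)counter_delta(max - 5, 10));
	return 0;
}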
atomic_set(&box->refcnt, 1); - box->cpu = -1; - box->phys_id = -1; - - /* set default hrtimer timeout */ - box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; - - INIT_LIST_HEAD(&box->active_list); - - return box; -} - -/* - * Using uncore_pmu_event_init pmu event_init callback - * as a detection point for uncore events. - */ -static int uncore_pmu_event_init(struct perf_event *event); - -static bool is_uncore_event(struct perf_event *event) -{ - return event->pmu->event_init == uncore_pmu_event_init; -} - -static int -uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) -{ - struct perf_event *event; - int n, max_count; - - max_count = box->pmu->type->num_counters; - if (box->pmu->type->fixed_ctl) - max_count++; - - if (box->n_events >= max_count) - return -EINVAL; - - n = box->n_events; - - if (is_uncore_event(leader)) { - box->event_list[n] = leader; - n++; - } - - if (!dogrp) - return n; - - list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_uncore_event(event) || - event->state <= PERF_EVENT_STATE_OFF) - continue; - - if (n >= max_count) - return -EINVAL; - - box->event_list[n] = event; - n++; - } - return n; -} - -static struct event_constraint * -uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct event_constraint *c; - - if (type->ops->get_constraint) { - c = type->ops->get_constraint(box, event); - if (c) - return c; - } - - if (event->attr.config == UNCORE_FIXED_EVENT) - return &uncore_constraint_fixed; - - if (type->constraints) { - for_each_event_constraint(c, type->constraints) { - if ((event->hw.config & c->cmask) == c->code) - return c; - } - } - - return &type->unconstrainted; -} - -static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - if (box->pmu->type->ops->put_constraint) - box->pmu->type->ops->put_constraint(box, event); -} - -static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) -{ - unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - struct event_constraint *c; - int i, wmin, wmax, ret = 0; - struct hw_perf_event *hwc; - - bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); - - for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - c = uncore_get_event_constraint(box, box->event_list[i]); - box->event_constraint[i] = c; - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); - } - - /* fastpath, try to reuse previous register */ - for (i = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; - c = box->event_constraint[i]; - - /* never assigned */ - if (hwc->idx == -1) - break; - - /* constraint still honored */ - if (!test_bit(hwc->idx, c->idxmsk)) - break; - - /* not already used */ - if (test_bit(hwc->idx, used_mask)) - break; - - __set_bit(hwc->idx, used_mask); - if (assign) - assign[i] = hwc->idx; - } - /* slow path */ - if (i != n) - ret = perf_assign_events(box->event_constraint, n, - wmin, wmax, n, assign); - - if (!assign || ret) { - for (i = 0; i < n; i++) - uncore_put_event_constraint(box, box->event_list[i]); - } - return ret ? 
-EINVAL : 0; -} - -static void uncore_pmu_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int idx = event->hw.idx; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) - return; - - event->hw.state = 0; - box->events[idx] = event; - box->n_active++; - __set_bit(idx, box->active_mask); - - local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); - uncore_enable_event(box, event); - - if (box->n_active == 1) { - uncore_enable_box(box); - uncore_pmu_start_hrtimer(box); - } -} - -static void uncore_pmu_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (__test_and_clear_bit(hwc->idx, box->active_mask)) { - uncore_disable_event(box, event); - box->n_active--; - box->events[hwc->idx] = NULL; - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - if (box->n_active == 0) { - uncore_disable_box(box); - uncore_pmu_cancel_hrtimer(box); - } - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int uncore_pmu_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - int assign[UNCORE_PMC_IDX_MAX]; - int i, n, ret; - - if (!box) - return -ENODEV; - - ret = n = uncore_collect_events(box, event, false); - if (ret < 0) - return ret; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - ret = uncore_assign_events(box, assign, n); - if (ret) - return ret; - - /* save events moving to new counters */ - for (i = 0; i < box->n_events; i++) { - event = box->event_list[i]; - hwc = &event->hw; - - if (hwc->idx == assign[i] && - hwc->last_tag == box->tags[assign[i]]) - continue; - /* - * Ensure we don't accidentally enable a stopped - * counter simply because we rescheduled. - */ - if (hwc->state & PERF_HES_STOPPED) - hwc->state |= PERF_HES_ARCH; - - uncore_pmu_event_stop(event, PERF_EF_UPDATE); - } - - /* reprogram moved events into new counters */ - for (i = 0; i < n; i++) { - event = box->event_list[i]; - hwc = &event->hw; - - if (hwc->idx != assign[i] || - hwc->last_tag != box->tags[assign[i]]) - uncore_assign_hw_event(box, event, assign[i]); - else if (i < box->n_events) - continue; - - if (hwc->state & PERF_HES_ARCH) - continue; - - uncore_pmu_event_start(event, 0); - } - box->n_events = n; - - return 0; -} - -static void uncore_pmu_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - uncore_pmu_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - uncore_put_event_constraint(box, event); - - while (++i < box->n_events) - box->event_list[i - 1] = box->event_list[i]; - - --box->n_events; - break; - } - } - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; -} - -void uncore_pmu_event_read(struct perf_event *event) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - uncore_perf_event_update(box, event); -} - -/* - * validation ensures the group can be loaded onto the - * PMU if it was the only group available. 
- */ -static int uncore_validate_group(struct intel_uncore_pmu *pmu, - struct perf_event *event) -{ - struct perf_event *leader = event->group_leader; - struct intel_uncore_box *fake_box; - int ret = -EINVAL, n; - - fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); - if (!fake_box) - return -ENOMEM; - - fake_box->pmu = pmu; - /* - * the event is not yet connected with its - * siblings therefore we must first collect - * existing siblings, then add the new event - * before we can simulate the scheduling - */ - n = uncore_collect_events(fake_box, leader, true); - if (n < 0) - goto out; - - fake_box->n_events = n; - n = uncore_collect_events(fake_box, event, false); - if (n < 0) - goto out; - - fake_box->n_events = n; - - ret = uncore_assign_events(fake_box, NULL, n); -out: - kfree(fake_box); - return ret; -} - -static int uncore_pmu_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - int ret; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* - * Uncore PMU does measure at all privilege level all the time. - * So it doesn't make sense to specify any exclude bits. - */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_hv || event->attr.exclude_idle) - return -EINVAL; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - - if (event->attr.config == UNCORE_FIXED_EVENT) { - /* no fixed counter */ - if (!pmu->type->fixed_ctl) - return -EINVAL; - /* - * if there is only one fixed counter, only the first pmu - * can access the fixed counter - */ - if (pmu->type->single_fixed && pmu->pmu_idx > 0) - return -EINVAL; - - /* fixed counters have event field hardcoded to zero */ - hwc->config = 0ULL; - } else { - hwc->config = event->attr.config & pmu->type->event_mask; - if (pmu->type->ops->hw_config) { - ret = pmu->type->ops->hw_config(box, event); - if (ret) - return ret; - } - } - - if (event->group_leader != event) - ret = uncore_validate_group(pmu, event); - else - ret = 0; - - return ret; -} - -static ssize_t uncore_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); - -static struct attribute *uncore_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group uncore_pmu_attr_group = { - .attrs = uncore_pmu_attrs, -}; - -static int uncore_pmu_register(struct intel_uncore_pmu *pmu) -{ - int ret; - - if (!pmu->type->pmu) { - pmu->pmu = (struct pmu) { - .attr_groups = pmu->type->attr_groups, - .task_ctx_nr = perf_invalid_context, - .event_init = uncore_pmu_event_init, - .add = uncore_pmu_event_add, - .del = uncore_pmu_event_del, - .start = uncore_pmu_event_start, - .stop = uncore_pmu_event_stop, - .read = uncore_pmu_event_read, - }; - } else { - pmu->pmu = *pmu->type->pmu; - pmu->pmu.attr_groups = pmu->type->attr_groups; - } 
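uncore_pmu_event_init() above rejects sampling and all exclude_* bits and moves the event onto the package's designated reader CPU, so uncore events are consumed in counting mode on an explicit CPU. A minimal user-space sketch of such a consumer follows; the PMU type number and config value are placeholders that would normally come from /sys/bus/event_source/devices/<pmu>/, and reading uncore counters typically requires root.

/* Minimal counting-mode reader for a dynamic (uncore-style) PMU. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	unsigned long long count;
	int fd, cpu = 0;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = argc > 1 ? atoi(argv[1]) : 10;	/* placeholder PMU type */
	attr.config = argc > 2 ? strtoull(argv[2], NULL, 0) : 0x0; /* placeholder */
	/* No sample_period and no exclude_* bits: uncore event_init rejects both. */

	/* System-wide counting event on one CPU: pid == -1, cpu >= 0. */
	fd = perf_event_open(&attr, -1, cpu, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("count: %llu\n", count);
	close(fd);
	return 0;
}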
- - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } - - ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); - return ret; -} - -static void __init uncore_type_exit(struct intel_uncore_type *type) -{ - int i; - - for (i = 0; i < type->num_boxes; i++) - free_percpu(type->pmus[i].box); - kfree(type->pmus); - type->pmus = NULL; - kfree(type->events_group); - type->events_group = NULL; -} - -static void __init uncore_types_exit(struct intel_uncore_type **types) -{ - int i; - for (i = 0; types[i]; i++) - uncore_type_exit(types[i]); -} - -static int __init uncore_type_init(struct intel_uncore_type *type) -{ - struct intel_uncore_pmu *pmus; - struct attribute_group *attr_group; - struct attribute **attrs; - int i, j; - - pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); - if (!pmus) - return -ENOMEM; - - type->pmus = pmus; - - type->unconstrainted = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, type->num_counters, 0, 0); - - for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = -1; - pmus[i].pmu_idx = i; - pmus[i].type = type; - INIT_LIST_HEAD(&pmus[i].box_list); - pmus[i].box = alloc_percpu(struct intel_uncore_box *); - if (!pmus[i].box) - goto fail; - } - - if (type->event_descs) { - i = 0; - while (type->event_descs[i].attr.attr.name) - i++; - - attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + - sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) - goto fail; - - attrs = (struct attribute **)(attr_group + 1); - attr_group->name = "events"; - attr_group->attrs = attrs; - - for (j = 0; j < i; j++) - attrs[j] = &type->event_descs[j].attr.attr; - - type->events_group = attr_group; - } - - type->pmu_group = &uncore_pmu_attr_group; - return 0; -fail: - uncore_type_exit(type); - return -ENOMEM; -} - -static int __init uncore_types_init(struct intel_uncore_type **types) -{ - int i, ret; - - for (i = 0; types[i]; i++) { - ret = uncore_type_init(types[i]); - if (ret) - goto fail; - } - return 0; -fail: - while (--i >= 0) - uncore_type_exit(types[i]); - return ret; -} - -/* - * add a pci uncore device - */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct intel_uncore_type *type; - int phys_id; - bool first_box = false; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[phys_id][idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; - } - - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; - /* Knights Landing uses a common PCI device ID for multiple instances of - * an uncore PMU device type. There is only one entry per device type in - * the knl_uncore_pci_ids table inspite of multiple devices present for - * some device types. Hence PCI device idx would be 0 for all devices. - * So increment pmu pointer to point to an unused array element. 
- */ - if (boot_cpu_data.x86_model == 87) - while (pmu->func_id >= 0) - pmu++; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - - box->phys_id = phys_id; - box->pci_dev = pdev; - box->pmu = pmu; - uncore_box_init(box); - pci_set_drvdata(pdev, box); - - raw_spin_lock(&uncore_box_lock); - if (list_empty(&pmu->box_list)) - first_box = true; - list_add_tail(&box->list, &pmu->box_list); - raw_spin_unlock(&uncore_box_lock); - - if (first_box) - uncore_pmu_register(pmu); - return 0; -} - -static void uncore_pci_remove(struct pci_dev *pdev) -{ - struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu; - int i, cpu, phys_id; - bool last_box = false; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - box = pci_get_drvdata(pdev); - if (!box) { - for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[phys_id][i] == pdev) { - uncore_extra_pci_dev[phys_id][i] = NULL; - break; - } - } - WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); - return; - } - - pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->phys_id)) - return; - - pci_set_drvdata(pdev, NULL); - - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - if (list_empty(&pmu->box_list)) - last_box = true; - raw_spin_unlock(&uncore_box_lock); - - for_each_possible_cpu(cpu) { - if (*per_cpu_ptr(pmu->box, cpu) == box) { - *per_cpu_ptr(pmu->box, cpu) = NULL; - atomic_dec(&box->refcnt); - } - } - - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); - kfree(box); - - if (last_box) - perf_pmu_unregister(&pmu->pmu); -} - -static int __init uncore_pci_init(void) -{ - int ret; - - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return 0; - } - - if (ret) - return ret; - - ret = uncore_types_init(uncore_pci_uncores); - if (ret) - return ret; - - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; - - ret = pci_register_driver(uncore_pci_driver); - if (ret == 0) - pcidrv_registered = true; - else - uncore_types_exit(uncore_pci_uncores); - - return ret; -} - -static void __init uncore_pci_exit(void) -{ - if (pcidrv_registered) { - pcidrv_registered = false; - pci_unregister_driver(uncore_pci_driver); - uncore_types_exit(uncore_pci_uncores); - } -} - -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ -static LIST_HEAD(boxes_to_free); - -static void uncore_kfree_boxes(void) -{ - struct intel_uncore_box *box; - - while (!list_empty(&boxes_to_free)) { - box = list_entry(boxes_to_free.next, - struct intel_uncore_box, list); - list_del(&box->list); - kfree(box); - } -} - -static void uncore_cpu_dying(int cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - 
type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) - list_add(&box->list, &boxes_to_free); - } - } -} - -static int uncore_cpu_starting(int cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box, *exist; - int i, j, k, phys_id; - - phys_id = topology_physical_package_id(cpu); - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); - continue; - } - - for_each_online_cpu(k) { - exist = *per_cpu_ptr(pmu->box, k); - if (exist && exist->phys_id == phys_id) { - atomic_inc(&exist->refcnt); - *per_cpu_ptr(pmu->box, cpu) = exist; - if (box) { - list_add(&box->list, - &boxes_to_free); - box = NULL; - } - break; - } - } - - if (box) { - box->phys_id = phys_id; - uncore_box_init(box); - } - } - } - return 0; -} - -static int uncore_cpu_prepare(int cpu, int phys_id) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (pmu->func_id < 0) - pmu->func_id = j; - - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - - box->pmu = pmu; - box->phys_id = phys_id; - *per_cpu_ptr(pmu->box, cpu) = box; - } - } - return 0; -} - -static void -uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncores[i]; i++) { - type = uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); - if (!box) - continue; - - if (old_cpu < 0) { - WARN_ON_ONCE(box->cpu != -1); - box->cpu = new_cpu; - continue; - } - - WARN_ON_ONCE(box->cpu != old_cpu); - if (new_cpu >= 0) { - uncore_pmu_cancel_hrtimer(box); - perf_pmu_migrate_context(&pmu->pmu, - old_cpu, new_cpu); - box->cpu = new_cpu; - } else { - box->cpu = -1; - } - } - } -} - -static void uncore_event_exit_cpu(int cpu) -{ - int i, phys_id, target; - - /* if exiting cpu is used for collecting uncore events */ - if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return; - - /* find a new cpu to collect uncore events */ - phys_id = topology_physical_package_id(cpu); - target = -1; - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - - /* migrate uncore events to the new cpu */ - if (target >= 0) - cpumask_set_cpu(target, &uncore_cpu_mask); - - uncore_change_context(uncore_msr_uncores, cpu, target); - uncore_change_context(uncore_pci_uncores, cpu, target); -} - -static void uncore_event_init_cpu(int cpu) -{ - int i, phys_id; - - phys_id = topology_physical_package_id(cpu); - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - - cpumask_set_cpu(cpu, &uncore_cpu_mask); - - uncore_change_context(uncore_msr_uncores, -1, cpu); - uncore_change_context(uncore_pci_uncores, -1, cpu); -} - -static int 
uncore_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - /* allocate/free data structure for uncore box */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - uncore_cpu_prepare(cpu, -1); - break; - case CPU_STARTING: - uncore_cpu_starting(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - uncore_cpu_dying(cpu); - break; - case CPU_ONLINE: - case CPU_DEAD: - uncore_kfree_boxes(); - break; - default: - break; - } - - /* select the cpu that collects uncore events */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - uncore_event_init_cpu(cpu); - break; - case CPU_DOWN_PREPARE: - uncore_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block uncore_cpu_nb = { - .notifier_call = uncore_cpu_notifier, - /* - * to migrate uncore events, our notifier should be executed - * before perf core's notifier. - */ - .priority = CPU_PRI_PERF + 1, -}; - -static void __init uncore_cpu_setup(void *dummy) -{ - uncore_cpu_starting(smp_processor_id()); -} - -static int __init uncore_cpu_init(void) -{ - int ret; - - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return 0; - } - - ret = uncore_types_init(uncore_msr_uncores); - if (ret) - return ret; - - return 0; -} - -static int __init uncore_pmus_register(void) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_type *type; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - return 0; -} - -static void __init uncore_cpumask_init(void) -{ - int cpu; - - /* - * ony invoke once from msr or pci init code - */ - if (!cpumask_empty(&uncore_cpu_mask)) - return; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); - - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) - continue; - - uncore_cpu_prepare(cpu, phys_id); - uncore_event_init_cpu(cpu); - } - on_each_cpu(uncore_cpu_setup, NULL, 1); - - __register_cpu_notifier(&uncore_cpu_nb); - - cpu_notifier_register_done(); -} - - -static int __init intel_uncore_init(void) -{ - int ret; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (cpu_has_hypervisor) - return -ENODEV; - - ret = uncore_pci_init(); - if (ret) - goto fail; - ret = uncore_cpu_init(); - if (ret) { - uncore_pci_exit(); - goto fail; - } - uncore_cpumask_init(); - - uncore_pmus_register(); - return 0; -fail: - return ret; -} -device_initcall(intel_uncore_init); diff --git 
a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h deleted file mode 100644 index a7086b862156..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ /dev/null @@ -1,357 +0,0 @@ -#include -#include -#include -#include -#include "perf_event.h" - -#define UNCORE_PMU_NAME_LEN 32 -#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) -#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) - -#define UNCORE_FIXED_EVENT 0xff -#define UNCORE_PMC_IDX_MAX_GENERIC 8 -#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC -#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) - -#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) -#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) -#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) -#define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 3 - -/* support up to 8 sockets */ -#define UNCORE_SOCKET_MAX 8 - -#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) - -struct intel_uncore_ops; -struct intel_uncore_pmu; -struct intel_uncore_box; -struct uncore_event_desc; - -struct intel_uncore_type { - const char *name; - int num_counters; - int num_boxes; - int perf_ctr_bits; - int fixed_ctr_bits; - unsigned perf_ctr; - unsigned event_ctl; - unsigned event_mask; - unsigned fixed_ctr; - unsigned fixed_ctl; - unsigned box_ctl; - unsigned msr_offset; - unsigned num_shared_regs:8; - unsigned single_fixed:1; - unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; - struct event_constraint unconstrainted; - struct event_constraint *constraints; - struct intel_uncore_pmu *pmus; - struct intel_uncore_ops *ops; - struct uncore_event_desc *event_descs; - const struct attribute_group *attr_groups[4]; - struct pmu *pmu; /* for custom pmu ops */ -}; - -#define pmu_group attr_groups[0] -#define format_group attr_groups[1] -#define events_group attr_groups[2] - -struct intel_uncore_ops { - void (*init_box)(struct intel_uncore_box *); - void (*disable_box)(struct intel_uncore_box *); - void (*enable_box)(struct intel_uncore_box *); - void (*disable_event)(struct intel_uncore_box *, struct perf_event *); - void (*enable_event)(struct intel_uncore_box *, struct perf_event *); - u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); - int (*hw_config)(struct intel_uncore_box *, struct perf_event *); - struct event_constraint *(*get_constraint)(struct intel_uncore_box *, - struct perf_event *); - void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); -}; - -struct intel_uncore_pmu { - struct pmu pmu; - char name[UNCORE_PMU_NAME_LEN]; - int pmu_idx; - int func_id; - struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; -}; - -struct intel_uncore_extra_reg { - raw_spinlock_t lock; - u64 config, config1, config2; - atomic_t ref; -}; - -struct intel_uncore_box { - int phys_id; - int n_active; /* number of active events */ - int n_events; - int cpu; /* cpu to collect events */ - unsigned long flags; - atomic_t refcnt; - struct perf_event *events[UNCORE_PMC_IDX_MAX]; - struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; - struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; - unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - u64 tags[UNCORE_PMC_IDX_MAX]; - struct pci_dev *pci_dev; - struct intel_uncore_pmu *pmu; - u64 hrtimer_duration; /* hrtimer timeout for this box */ - struct hrtimer hrtimer; - struct list_head list; - struct list_head active_list; - void *io_addr; - struct 
intel_uncore_extra_reg shared_regs[0]; -}; - -#define UNCORE_BOX_FLAG_INITIATED 0 - -struct uncore_event_desc { - struct kobj_attribute attr; - const char *config; -}; - -struct pci2phy_map { - struct list_head list; - int segment; - int pbus_to_physid[256]; -}; - -int uncore_pcibus_to_physid(struct pci_bus *bus); -struct pci2phy_map *__find_pci2phy_map(int segment); - -ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf); - -#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ - .config = _config, \ -} - -#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __uncore_##_var##_show, NULL) - -static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) -{ - return box->pmu->type->box_ctl; -} - -static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctl; -} - -static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr; -} - -static inline -unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) -{ - return idx * 4 + box->pmu->type->event_ctl; -} - -static inline -unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) -{ - return idx * 8 + box->pmu->type->perf_ctr; -} - -static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) -{ - struct intel_uncore_pmu *pmu = box->pmu; - return pmu->type->msr_offsets ? - pmu->type->msr_offsets[pmu->pmu_idx] : - pmu->type->msr_offset * pmu->pmu_idx; -} - -static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) -{ - if (!box->pmu->type->box_ctl) - return 0; - return box->pmu->type->box_ctl + uncore_msr_box_offset(box); -} - -static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) -{ - if (!box->pmu->type->fixed_ctl) - return 0; - return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); -} - -static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) -{ - return box->pmu->type->event_ctl + - (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) -{ - return box->pmu->type->perf_ctr + - (box->pmu->type->pair_ctr_ctl ? 
2 * idx : idx) + - uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_fixed_ctl(struct intel_uncore_box *box) -{ - if (box->pci_dev) - return uncore_pci_fixed_ctl(box); - else - return uncore_msr_fixed_ctl(box); -} - -static inline -unsigned uncore_fixed_ctr(struct intel_uncore_box *box) -{ - if (box->pci_dev) - return uncore_pci_fixed_ctr(box); - else - return uncore_msr_fixed_ctr(box); -} - -static inline -unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) -{ - if (box->pci_dev) - return uncore_pci_event_ctl(box, idx); - else - return uncore_msr_event_ctl(box, idx); -} - -static inline -unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) -{ - if (box->pci_dev) - return uncore_pci_perf_ctr(box, idx); - else - return uncore_msr_perf_ctr(box, idx); -} - -static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) -{ - return box->pmu->type->perf_ctr_bits; -} - -static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr_bits; -} - -static inline int uncore_num_counters(struct intel_uncore_box *box) -{ - return box->pmu->type->num_counters; -} - -static inline void uncore_disable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->disable_box) - box->pmu->type->ops->disable_box(box); -} - -static inline void uncore_enable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->enable_box) - box->pmu->type->ops->enable_box(box); -} - -static inline void uncore_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - box->pmu->type->ops->disable_event(box, event); -} - -static inline void uncore_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - box->pmu->type->ops->enable_event(box, event); -} - -static inline u64 uncore_read_counter(struct intel_uncore_box *box, - struct perf_event *event) -{ - return box->pmu->type->ops->read_counter(box, event); -} - -static inline void uncore_box_init(struct intel_uncore_box *box) -{ - if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { - if (box->pmu->type->ops->init_box) - box->pmu->type->ops->init_box(box); - } -} - -static inline bool uncore_box_is_fake(struct intel_uncore_box *box) -{ - return (box->phys_id < 0); -} - -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); -struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); -u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); -void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); -void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); -void uncore_pmu_event_read(struct perf_event *event); -void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); -struct event_constraint * -uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); -void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); -u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); - -extern struct intel_uncore_type **uncore_msr_uncores; -extern struct intel_uncore_type **uncore_pci_uncores; -extern struct pci_driver *uncore_pci_driver; -extern raw_spinlock_t pci2phy_map_lock; -extern struct list_head pci2phy_map_head; -extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; -extern struct event_constraint uncore_constraint_empty; - -/* perf_event_intel_uncore_snb.c */ -int 
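
The uncore_msr_*() accessors above compute an MSR address as base + per-counter stride + per-box offset, where the per-box offset comes from msr_offsets[pmu_idx] (or msr_offset * pmu_idx) and pair_ctr_ctl doubles the stride because control and counter registers are interleaved. A minimal standalone sketch of that arithmetic, not part of any patch here, plugging in the NHM-EX Cbox values that appear in the uncore_nhmex.c diff further below (event select base 0xd10, counter base 0xd11, per-box offsets 0x0, 0x80, 0x40, 0xc0, ...):

#include <stdio.h>

static const unsigned int cbox_event_ctl = 0xd10;	/* NHMEX_C0_MSR_PMON_EV_SEL0 */
static const unsigned int cbox_perf_ctr  = 0xd11;	/* NHMEX_C0_MSR_PMON_CTR0 */
static const unsigned int cbox_offsets[] = { 0x0, 0x80, 0x40, 0xc0 };
static const int pair_ctr_ctl = 1;			/* ctl/ctr registers are paired */

/* mirrors uncore_msr_event_ctl()/uncore_msr_perf_ctr() for an offsets[] type */
static unsigned int msr_addr(unsigned int base, int box, int idx)
{
	return base + (pair_ctr_ctl ? 2 * idx : idx) + cbox_offsets[box];
}

int main(void)
{
	int box, idx;

	for (box = 0; box < 2; box++)
		for (idx = 0; idx < 3; idx++)
			printf("cbox %d, counter %d: ctl MSR 0x%x, ctr MSR 0x%x\n",
			       box, idx,
			       msr_addr(cbox_event_ctl, box, idx),
			       msr_addr(cbox_perf_ctr, box, idx));
	return 0;
}
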
snb_uncore_pci_init(void); -int ivb_uncore_pci_init(void); -int hsw_uncore_pci_init(void); -int bdw_uncore_pci_init(void); -int skl_uncore_pci_init(void); -void snb_uncore_cpu_init(void); -void nhm_uncore_cpu_init(void); -int snb_pci2phy_map_init(int devid); - -/* perf_event_intel_uncore_snbep.c */ -int snbep_uncore_pci_init(void); -void snbep_uncore_cpu_init(void); -int ivbep_uncore_pci_init(void); -void ivbep_uncore_cpu_init(void); -int hswep_uncore_pci_init(void); -void hswep_uncore_cpu_init(void); -int bdx_uncore_pci_init(void); -void bdx_uncore_cpu_init(void); -int knl_uncore_pci_init(void); -void knl_uncore_cpu_init(void); - -/* perf_event_intel_uncore_nhmex.c */ -void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c index 2749965afed0..d70cfe0be86a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c @@ -1,5 +1,5 @@ /* Nehalem-EX/Westmere-EX uncore support */ -#include "perf_event_intel_uncore.h" +#include "../../events/intel/uncore.h" /* NHM-EX event control */ #define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 2bd030ddd0db..e0e41f55a20c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -1,5 +1,5 @@ /* Nehalem/SandBridge/Haswell uncore support */ -#include "perf_event_intel_uncore.h" +#include "../../events/intel/uncore.h" /* Uncore IMC PCI IDs */ #define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 33acb884ccf1..188e18a1f224 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1,6 +1,5 @@ /* SandyBridge-EP/IvyTown uncore support */ -#include "perf_event_intel_uncore.h" - +#include "../../events/intel/uncore.h" /* SNB-EP Box level control */ #define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) -- cgit From 35bf705c25e875f40ff8ed6af415315335973977 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:16 +0100 Subject: perf/x86: Move perf_event_intel_uncore_nhmex.c => x86/events/intel/uncore_nmhex.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/uncore_nhmex.c | 1221 ++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 3 +- .../x86/kernel/cpu/perf_event_intel_uncore_nhmex.c | 1221 -------------------- 4 files changed, 1223 insertions(+), 1224 deletions(-) create mode 100644 arch/x86/events/intel/uncore_nhmex.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index aae3e530e2ee..70aedfea45b4 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -8,4 +8,4 @@ endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o 
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c new file mode 100644 index 000000000000..e89bf5c536e0 --- /dev/null +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -0,0 +1,1221 @@ +/* Nehalem-EX/Westmere-EX uncore support */ +#include "uncore.h" + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define 
NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) + +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 
0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + 
.enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 10, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offsets = nhmex_cbox_msr_offsets, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + 
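
nhmex_bbox_hw_config() above splits a raw Bbox config with NHMEX_B_PMON_CTL_EV_SEL_MASK (bits 1-5, the "event5" format attribute) and NHMEX_B_PMON_CTR_MASK (bits 6-7, the "counter" format attribute), because each of the four Bbox counters monitors a different set of events. A minimal standalone decode of a few example config values, not part of any patch here:

#include <stdio.h>

#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT	1
#define NHMEX_B_PMON_CTL_EV_SEL_MASK	(0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
#define NHMEX_B_PMON_CTR_SHIFT		6
#define NHMEX_B_PMON_CTR_MASK		(0x3 << NHMEX_B_PMON_CTR_SHIFT)

int main(void)
{
	/* arbitrary example configs, one per counter */
	unsigned int configs[] = { 0x02, 0x46, 0x88, 0xc0 };
	unsigned int i;

	for (i = 0; i < sizeof(configs) / sizeof(configs[0]); i++) {
		unsigned int cfg = configs[i];
		unsigned int ctr = (cfg & NHMEX_B_PMON_CTR_MASK) >> NHMEX_B_PMON_CTR_SHIFT;
		unsigned int ev  = (cfg & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> NHMEX_B_PMON_CTL_EV_SEL_SHIFT;

		printf("config 0x%02x -> counter %u, event select 0x%x\n", cfg, ctr, ev);
	}
	return 0;
}
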
wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = 
NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +/* Nehalem-EX or Westmere-EX ? */ +static bool uncore_nhmex; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. 
+ */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + raw_spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } + + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid 
doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. + */ + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; + } + } + + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &uncore_constraint_empty; +} + +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } +} + +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) +{ + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; +} + +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* + * The mbox only provides ability to perform address matching + * for the PLD events. 
+ */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg_en.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, 
"inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + /* adjust the main event selector and extra register index */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust extra register config */ + switch (reg1->idx % 6) { + case 2: + /* shift the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* shift the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + } +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. 
+ */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + idx ^= 1; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &uncore_constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + switch (idx % 6) { + case 4: + case 5: + hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; + break; + } + return 0; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, port; + + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; + + switch (idx % 6) { + case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; + case 1: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); + break; + case 2: + case 3: + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); + break; + case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; + case 5: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); + break; + } + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); 
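
The Rbox constraint code above maps an event's extra-register index onto box->shared_regs[]: idx % 6 selects the event within a set, the 3rd and 4th events of a set share one register, and each set of 6 events therefore occupies 5 slots. A minimal standalone sketch of that mapping, not part of any patch here:

#include <stdio.h>

/* mirrors the er_idx computation in nhmex_rbox_get/put_constraint() */
static int rbox_er_slot(int idx)
{
	int slot = idx % 6;		/* event within its set of 6 */

	if (slot > 2)			/* 3rd and 4th events share a slot */
		slot--;
	return slot + (idx / 6) * 5;	/* 5 shared-register slots per set */
}

int main(void)
{
	int idx;

	for (idx = 0; idx < 12; idx++)
		printf("event idx %2d -> shared_regs[%d]\n", idx, rbox_er_slot(idx));
	return 0;
}
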
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; + +void nhmex_uncore_cpu_init(void) +{ + if (boot_cpu_data.x86_model == 46) + uncore_nhmex = true; + else + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = nhmex_msr_uncores; +} +/* end of Nehalem-EX uncore support */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 875d1b1f5e32..316f53a7f942 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,8 +34,7 @@ ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snb.o \ - perf_event_intel_uncore_snbep.o \ - perf_event_intel_uncore_nhmex.o + perf_event_intel_uncore_snbep.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c deleted file mode 100644 index d70cfe0be86a..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c +++ /dev/null @@ -1,1221 +0,0 @@ -/* Nehalem-EX/Westmere-EX uncore support */ -#include "../../events/intel/uncore.h" - -/* NHM-EX event 
control */ -#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff -#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 -#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) -#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) -#define NHMEX_PMON_CTL_PMI_EN (1 << 20) -#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) -#define NHMEX_PMON_CTL_INVERT (1 << 23) -#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 -#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_UMASK_MASK | \ - NHMEX_PMON_CTL_EDGE_DET | \ - NHMEX_PMON_CTL_INVERT | \ - NHMEX_PMON_CTL_TRESH_MASK) - -/* NHM-EX Ubox */ -#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define NHMEX_U_MSR_PMON_CTR 0xc11 -#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 - -#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) -#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e -#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) -#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) -#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) - -#define NHMEX_U_PMON_RAW_EVENT_MASK \ - (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_EDGE_DET) - -/* NHM-EX Cbox */ -#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 -#define NHMEX_C0_MSR_PMON_CTR0 0xd11 -#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 -#define NHMEX_C_MSR_OFFSET 0x20 - -/* NHM-EX Bbox */ -#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 -#define NHMEX_B0_MSR_PMON_CTR0 0xc31 -#define NHMEX_B0_MSR_PMON_CTL0 0xc30 -#define NHMEX_B_MSR_OFFSET 0x40 -#define NHMEX_B0_MSR_MATCH 0xe45 -#define NHMEX_B0_MSR_MASK 0xe46 -#define NHMEX_B1_MSR_MATCH 0xe4d -#define NHMEX_B1_MSR_MASK 0xe4e - -#define NHMEX_B_PMON_CTL_EN (1 << 0) -#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_B_PMON_CTR_SHIFT 6 -#define NHMEX_B_PMON_CTR_MASK \ - (0x3 << NHMEX_B_PMON_CTR_SHIFT) -#define NHMEX_B_PMON_RAW_EVENT_MASK \ - (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ - NHMEX_B_PMON_CTR_MASK) - -/* NHM-EX Sbox */ -#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 -#define NHMEX_S0_MSR_PMON_CTR0 0xc51 -#define NHMEX_S0_MSR_PMON_CTL0 0xc50 -#define NHMEX_S_MSR_OFFSET 0x80 -#define NHMEX_S0_MSR_MM_CFG 0xe48 -#define NHMEX_S0_MSR_MATCH 0xe49 -#define NHMEX_S0_MSR_MASK 0xe4a -#define NHMEX_S1_MSR_MM_CFG 0xe58 -#define NHMEX_S1_MSR_MATCH 0xe59 -#define NHMEX_S1_MSR_MASK 0xe5a - -#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) -#define NHMEX_S_EVENT_TO_R_PROG_EV 0 - -/* NHM-EX Mbox */ -#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -#define NHMEX_M0_MSR_PMU_DSP 0xca5 -#define NHMEX_M0_MSR_PMU_ISS 0xca6 -#define NHMEX_M0_MSR_PMU_MAP 0xca7 -#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 -#define NHMEX_M0_MSR_PMU_PGT 0xca9 -#define NHMEX_M0_MSR_PMU_PLD 0xcaa -#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab -#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 -#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 -#define NHMEX_M_MSR_OFFSET 0x40 -#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 -#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c - -#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) -#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL -#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL -#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 - -#define NHMEX_M_PMON_CTL_EN (1 << 0) -#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) -#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 -#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 -#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) -#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) -#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 -#define 
NHMEX_M_PMON_CTL_INC_SEL_MASK \ - (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ - (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) -#define NHMEX_M_PMON_RAW_EVENT_MASK \ - (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ - NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ - NHMEX_M_PMON_CTL_WRAP_MODE | \ - NHMEX_M_PMON_CTL_FLAG_MODE | \ - NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) -#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) - -#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) -#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) - -/* - * use the 9~13 bits to select event If the 7th bit is not set, - * otherwise use the 19~21 bits to select event. - */ -#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_EXTAR_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) -#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_SET_FLAG_SEL_MASK, \ - (u64)-1, NHMEX_M_##r) - -/* NHM-EX Rbox */ -#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 -#define NHMEX_R_MSR_PMON_CTL0 0xe10 -#define NHMEX_R_MSR_PMON_CNT0 0xe11 -#define NHMEX_R_MSR_OFFSET 0x20 - -#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ - ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) -#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ - (((n) < 4 ? 
0 : 0x10) + (n) * 4) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ - (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ - (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) - -#define NHMEX_R_PMON_CTL_EN (1 << 0) -#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) -#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK - -/* NHM-EX Wbox */ -#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 -#define NHMEX_W_MSR_PMON_CNT0 0xc90 -#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 -#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 -#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 - -#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) - -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); - -static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) -{ - wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); -} - -static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config &= ~((1ULL << uncore_num_counters(box)) - 1); - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config |= (1ULL << uncore_num_counters(box)) - 1; - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx >= UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); - else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); - else - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -#define NHMEX_UNCORE_OPS_COMMON_INIT() \ - .init_box = nhmex_uncore_msr_init_box, \ - .disable_box = nhmex_uncore_msr_disable_box, \ - .enable_box = nhmex_uncore_msr_enable_box, \ - .disable_event = nhmex_uncore_msr_disable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops nhmex_uncore_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - 
.enable_event = nhmex_uncore_msr_enable_event, -}; - -static struct attribute *nhmex_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_edge.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_ubox_format_group = { - .name = "format", - .attrs = nhmex_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type nhmex_uncore_ubox = { - .name = "ubox", - .num_counters = 1, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, - .perf_ctr = NHMEX_U_MSR_PMON_CTR, - .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_ubox_format_group -}; - -static struct attribute *nhmex_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_cbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_cbox_formats_attr, -}; - -/* msr offset for each instance of cbox */ -static unsigned nhmex_cbox_msr_offsets[] = { - 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, -}; - -static struct intel_uncore_type nhmex_uncore_cbox = { - .name = "cbox", - .num_counters = 6, - .num_boxes = 10, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, - .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offsets = nhmex_cbox_msr_offsets, - .pair_ctr_ctl = 1, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static struct uncore_event_desc nhmex_uncore_wbox_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type nhmex_uncore_wbox = { - .name = "wbox", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_W_MSR_PMON_CNT0, - .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, - .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, - .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, - .pair_ctr_ctl = 1, - .event_descs = nhmex_uncore_wbox_events, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int ctr, ev_sel; - - ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> - NHMEX_B_PMON_CTR_SHIFT; - ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> - NHMEX_B_PMON_CTL_EV_SEL_SHIFT; - - /* events that do not use the match/mask registers */ - if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || - (ctr == 2 && ev_sel != 0x4) || ctr == 3) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_B0_MSR_MATCH; - else - reg1->reg = NHMEX_B1_MSR_MATCH; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, reg2->config); - } - 
wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); -} - -/* - * The Bbox has 4 counters, but each counter monitors different events. - * Use bits 6-7 in the event config to select counter. - */ -static struct event_constraint nhmex_uncore_bbox_constraints[] = { - EVENT_CONSTRAINT(0 , 1, 0xc0), - EVENT_CONSTRAINT(0x40, 2, 0xc0), - EVENT_CONSTRAINT(0x80, 4, 0xc0), - EVENT_CONSTRAINT(0xc0, 8, 0xc0), - EVENT_CONSTRAINT_END, -}; - -static struct attribute *nhmex_uncore_bbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_counter.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_bbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_bbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_bbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_bbox_msr_enable_event, - .hw_config = nhmex_bbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_bbox = { - .name = "bbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_B0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, - .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_B_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .constraints = nhmex_uncore_bbox_constraints, - .ops = &nhmex_uncore_bbox_ops, - .format_group = &nhmex_uncore_bbox_format_group -}; - -static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - /* only TO_R_PROG_EV event uses the match/mask register */ - if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != - NHMEX_S_EVENT_TO_R_PROG_EV) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_S0_MSR_MM_CFG; - else - reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, 0); - wrmsrl(reg1->reg + 1, reg1->config); - wrmsrl(reg1->reg + 2, reg2->config); - wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); - } - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); -} - -static struct attribute *nhmex_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_sbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_sbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_sbox_msr_enable_event, - .hw_config = nhmex_sbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = 
NHMEX_S0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_S_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .ops = &nhmex_uncore_sbox_ops, - .format_group = &nhmex_uncore_sbox_format_group -}; - -enum { - EXTRA_REG_NHMEX_M_FILTER, - EXTRA_REG_NHMEX_M_DSP, - EXTRA_REG_NHMEX_M_ISS, - EXTRA_REG_NHMEX_M_MAP, - EXTRA_REG_NHMEX_M_MSC_THR, - EXTRA_REG_NHMEX_M_PGT, - EXTRA_REG_NHMEX_M_PLD, - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, -}; - -static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { - MBOX_INC_SEL_EXTAR_REG(0x0, DSP), - MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x9, ISS), - /* event 0xa uses two extra registers */ - MBOX_INC_SEL_EXTAR_REG(0xa, ISS), - MBOX_INC_SEL_EXTAR_REG(0xa, PLD), - MBOX_INC_SEL_EXTAR_REG(0xb, PLD), - /* events 0xd ~ 0x10 use the same extra register */ - MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x16, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), - EVENT_EXTRA_END -}; - -/* Nehalem-EX or Westmere-EX ? */ -static bool uncore_nhmex; - -static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - bool ret = false; - u64 mask; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || er->config == config) { - atomic_inc(&er->ref); - er->config = config; - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; - } - /* - * The ZDP_CTL_FVC MSR has 4 fields which are used to control - * events 0xd ~ 0x10. Besides these 4 fields, there are additional - * fields which are shared. 
- */ - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (WARN_ON_ONCE(idx >= 4)) - return false; - - /* mask of the shared fields */ - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - - raw_spin_lock_irqsave(&er->lock, flags); - /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { - if (uncore_nhmex) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - } - - if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | - WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - er->config &= ~mask; - er->config |= (config & mask); - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; -} - -static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - atomic_dec(&er->ref); - return; - } - - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - atomic_sub(1 << (idx * 8), &er->ref); -} - -static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); - u64 config = reg1->config; - - /* get the non-shared control bits and shift them */ - idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (uncore_nhmex) - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - if (new_idx > orig_idx) { - idx = new_idx - orig_idx; - config <<= 3 * idx; - } else { - idx = orig_idx - new_idx; - config >>= 3 * idx; - } - - /* add the shared control bits back */ - if (uncore_nhmex) - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - else - config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - if (modify) { - /* adjust the main event selector */ - if (new_idx > orig_idx) - hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - else - hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - reg1->config = config; - reg1->idx = ~0xff | new_idx; - } - return config; -} - -static struct event_constraint * -nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int i, idx[2], alloc = 0; - u64 config1 = reg1->config; - - idx[0] = __BITS_VALUE(reg1->idx, 0, 8); - idx[1] = __BITS_VALUE(reg1->idx, 1, 8); -again: - for (i = 0; i < 2; i++) { - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - idx[i] = 0xff; - - if (idx[i] == 0xff) - continue; - - if (!nhmex_mbox_get_shared_reg(box, idx[i], - __BITS_VALUE(config1, i, 32))) - goto fail; - alloc |= (0x1 << i); - } - - /* for the match/mask registers */ - if (reg2->idx != EXTRA_REG_NONE && - (uncore_box_is_fake(box) || !reg2->alloc) && - !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) - goto fail; - - /* - * If it's a fake box -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid 
doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - */ - if (!uncore_box_is_fake(box)) { - if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) - nhmex_mbox_alter_er(event, idx[0], true); - reg1->alloc |= alloc; - if (reg2->idx != EXTRA_REG_NONE) - reg2->alloc = 1; - } - return NULL; -fail: - if (idx[0] != 0xff && !(alloc & 0x1) && - idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - /* - * events 0xd ~ 0x10 are functional identical, but are - * controlled by different fields in the ZDP_CTL_FVC - * register. If we failed to take one field, try the - * rest 3 choices. - */ - BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); - idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - idx[0] = (idx[0] + 1) % 4; - idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { - config1 = nhmex_mbox_alter_er(event, idx[0], false); - goto again; - } - } - - if (alloc & 0x1) - nhmex_mbox_put_shared_reg(box, idx[0]); - if (alloc & 0x2) - nhmex_mbox_put_shared_reg(box, idx[1]); - return &uncore_constraint_empty; -} - -static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - - if (uncore_box_is_fake(box)) - return; - - if (reg1->alloc & 0x1) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); - if (reg1->alloc & 0x2) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); - reg1->alloc = 0; - - if (reg2->alloc) { - nhmex_mbox_put_shared_reg(box, reg2->idx); - reg2->alloc = 0; - } -} - -static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) -{ - if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return er->idx; - return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; -} - -static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - struct extra_reg *er; - unsigned msr; - int reg_idx = 0; - /* - * The mbox events may require 2 extra MSRs at the most. But only - * the lower 32 bits in these MSRs are significant, so we can use - * config1 to pass two MSRs' config. - */ - for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - - msr = er->msr + type->msr_offset * box->pmu->pmu_idx; - if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) - return -EINVAL; - - /* always use the 32~63 bits to pass the PLD config */ - if (er->idx == EXTRA_REG_NHMEX_M_PLD) - reg_idx = 1; - else if (WARN_ON_ONCE(reg_idx > 0)) - return -EINVAL; - - reg1->idx &= ~(0xff << (reg_idx * 8)); - reg1->reg &= ~(0xffff << (reg_idx * 16)); - reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); - reg1->reg |= msr << (reg_idx * 16); - reg1->config = event->attr.config1; - reg_idx++; - } - /* - * The mbox only provides ability to perform address matching - * for the PLD events. 
- */ - if (reg_idx == 2) { - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - } - return 0; -} - -static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return box->shared_regs[idx].config; - - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - return config; -} - -static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx; - - idx = __BITS_VALUE(reg1->idx, 0, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), - nhmex_mbox_shared_reg_config(box, idx)); - idx = __BITS_VALUE(reg1->idx, 1, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), - nhmex_mbox_shared_reg_config(box, idx)); - - if (reg2->idx != EXTRA_REG_NONE) { - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); - } - } - - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); - -static struct attribute *nhmex_uncore_mbox_formats_attr[] = { - &format_attr_count_mode.attr, - &format_attr_storage_mode.attr, - &format_attr_wrap_mode.attr, - &format_attr_flag_mode.attr, - &format_attr_inc_sel.attr, - &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg_en.attr, - &format_attr_filter_match.attr, - &format_attr_filter_mask.attr, - &format_attr_dsp.attr, - &format_attr_thr.attr, - &format_attr_fvc.attr, - &format_attr_pgt.attr, - &format_attr_map.attr, - &format_attr_iss.attr, - &format_attr_pld.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_mbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_mbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, 
"inc_sel=0xd,fvc=0x2820"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc wsmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_mbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_mbox_msr_enable_event, - .hw_config = nhmex_mbox_hw_config, - .get_constraint = nhmex_mbox_get_constraint, - .put_constraint = nhmex_mbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_mbox = { - .name = "mbox", - .num_counters = 6, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_M0_MSR_PMU_CTL0, - .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, - .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_M_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 8, - .event_descs = nhmex_uncore_mbox_events, - .ops = &nhmex_uncore_mbox_ops, - .format_group = &nhmex_uncore_mbox_format_group, -}; - -static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - /* adjust the main event selector and extra register index */ - if (reg1->idx % 2) { - reg1->idx--; - hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } else { - reg1->idx++; - hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } - - /* adjust extra register config */ - switch (reg1->idx % 6) { - case 2: - /* shift the 8~15 bits to the 0~7 bits */ - reg1->config >>= 8; - break; - case 3: - /* shift the 0~7 bits to the 8~15 bits */ - reg1->config <<= 8; - break; - } -} - -/* - * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. - * An event set consists of 6 events, the 3rd and 4th events in - * an event set use the same extra register. So an event set uses - * 5 extra registers. - */ -static struct event_constraint * -nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - struct intel_uncore_extra_reg *er; - unsigned long flags; - int idx, er_idx; - u64 config1; - bool ok = false; - - if (!uncore_box_is_fake(box) && reg1->alloc) - return NULL; - - idx = reg1->idx % 6; - config1 = reg1->config; -again: - er_idx = idx; - /* the 3rd and 4th events use the same extra register */ - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (idx < 2) { - if (!atomic_read(&er->ref) || er->config == reg1->config) { - atomic_inc(&er->ref); - er->config = reg1->config; - ok = true; - } - } else if (idx == 2 || idx == 3) { - /* - * these two events use different fields in a extra register, - * the 0~7 bits and the 8~15 bits respectively. 
- */ - u64 mask = 0xff << ((idx - 2) * 8); - if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || - !((er->config ^ config1) & mask)) { - atomic_add(1 << ((idx - 2) * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - } else { - if (!atomic_read(&er->ref) || - (er->config == (hwc->config >> 32) && - er->config1 == reg1->config && - er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config = (hwc->config >> 32); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - /* - * The Rbox events are always in pairs. The paired - * events are functional identical, but use different - * extra registers. If we failed to take an extra - * register, try the alternative. - */ - idx ^= 1; - if (idx != reg1->idx % 6) { - if (idx == 2) - config1 >>= 8; - else if (idx == 3) - config1 <<= 8; - goto again; - } - } else { - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx % 6) - nhmex_rbox_alter_er(box, event); - reg1->alloc = 1; - } - return NULL; - } - return &uncore_constraint_empty; -} - -static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - int idx, er_idx; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - if (idx == 2 || idx == 3) - atomic_sub(1 << ((idx - 2) * 8), &er->ref); - else - atomic_dec(&er->ref); - - reg1->alloc = 0; -} - -static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int idx; - - idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> - NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - if (idx >= 0x18) - return -EINVAL; - - reg1->idx = idx; - reg1->config = event->attr.config1; - - switch (idx % 6) { - case 4: - case 5: - hwc->config |= event->attr.config & (~0ULL << 32); - reg2->config = event->attr.config2; - break; - } - return 0; -} - -static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, port; - - idx = reg1->idx; - port = idx / 6 + box->pmu->pmu_idx * 4; - - switch (idx % 6) { - case 0: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); - break; - case 1: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); - break; - case 2: - case 3: - wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), - uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); - break; - case 4: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); - break; - case 5: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); - break; - } - - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); -} - -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); 
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); -DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); - -static struct attribute *nhmex_uncore_rbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_xbr_mm_cfg.attr, - &format_attr_xbr_match.attr, - &format_attr_xbr_mask.attr, - &format_attr_qlx_cfg.attr, - &format_attr_iperf_cfg.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_rbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_rbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_rbox_events[] = { - INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), - INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_rbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_rbox_msr_enable_event, - .hw_config = nhmex_rbox_hw_config, - .get_constraint = nhmex_rbox_get_constraint, - .put_constraint = nhmex_rbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_rbox = { - .name = "rbox", - .num_counters = 8, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_R_MSR_PMON_CTL0, - .perf_ctr = NHMEX_R_MSR_PMON_CNT0, - .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_R_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 20, - .event_descs = nhmex_uncore_rbox_events, - .ops = &nhmex_uncore_rbox_ops, - .format_group = &nhmex_uncore_rbox_format_group -}; - -static struct intel_uncore_type *nhmex_msr_uncores[] = { - &nhmex_uncore_ubox, - &nhmex_uncore_cbox, - &nhmex_uncore_bbox, - &nhmex_uncore_sbox, - &nhmex_uncore_mbox, - &nhmex_uncore_rbox, - &nhmex_uncore_wbox, - NULL, -}; - -void nhmex_uncore_cpu_init(void) -{ - if (boot_cpu_data.x86_model == 46) - uncore_nhmex = true; - else - nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = nhmex_msr_uncores; -} -/* end of Nehalem-EX uncore support */ -- cgit From 92553e40c6292408faa069b34a6db0dab4055080 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:17 +0100 Subject: perf/x86: Move perf_event_intel_uncore_snb.c => x86/events/intel/uncore_snb.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 1 + arch/x86/events/intel/uncore_snb.c | 717 ++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 3 +- arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 717 ---------------------- 4 files changed, 719 insertions(+), 719 deletions(-) create mode 100644 arch/x86/events/intel/uncore_snb.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c (limited to 'arch/x86') diff --git 
a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 70aedfea45b4..eb0d92198a56 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c new file mode 100644 index 000000000000..2049d26a7ae6 --- /dev/null +++ b/arch/x86/events/intel/uncore_snb.c @@ -0,0 +1,717 @@ +/* Nehalem/SandBridge/Haswell uncore support */ +#include "uncore.h" + +/* Uncore IMC PCI IDs */ +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 +#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0 0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + 
wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_arb_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type snb_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = SNB_UNC_ARB_PER_CTR0, + .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + &snb_uncore_arb, + NULL, +}; + +void snb_uncore_cpu_init(void) +{ + uncore_msr_uncores = snb_msr_uncores; + if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; +} + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void 
snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + + pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + 
return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + struct pci2phy_map *map; + int bus, segment; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + segment = pci_domain_nr(dev->bus); + + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + pci_dev_put(dev); + return -ENOMEM; + } + map->pbus_to_physid[bus] = 0; + raw_spin_unlock(&pci2phy_map_lock); + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static const struct pci_device_id snb_uncore_pci_ids[] = { + { /* IMC */ + 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id ivb_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id hsw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id bdw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id skl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + +static struct pci_driver bdw_uncore_pci_driver = { + .name = "bdw_uncore", + .id_table = bdw_uncore_pci_ids, +}; + +static struct pci_driver skl_uncore_pci_driver = { + .name = "skl_uncore", + .id_table = skl_uncore_pci_ids, +}; + +struct imc_uncore_pci_dev { + __u32 pci_id; + struct pci_driver *driver; +}; +#define IMC_DEV(a, d) \ + { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) } + +static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { + IMC_DEV(SNB_IMC, &snb_uncore_pci_driver), + IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ + IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ + IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ + IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ + IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ + IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ + { /* end marker */ } +}; + + +#define for_each_imc_pci_id(x, t) \ + for (x = (t); (x)->pci_id; x++) + +static struct pci_driver *imc_uncore_find_dev(void) +{ + const struct imc_uncore_pci_dev *p; + int ret; + + for_each_imc_pci_id(p, desktop_imc_pci_ids) { + ret = snb_pci2phy_map_init(p->pci_id); + if (ret == 0) + return p->driver; + } + return NULL; +} + +static int imc_uncore_pci_init(void) +{ + struct pci_driver *imc_drv = imc_uncore_find_dev(); + + if (!imc_drv) + return -ENODEV; + + uncore_pci_uncores = snb_pci_uncores; + uncore_pci_driver = imc_drv; + + return 0; +} + +int snb_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +int ivb_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} +int hsw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +int bdw_uncore_pci_init(void) +{ + return 
imc_uncore_pci_init(); +} + +int skl_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; + +void nhm_uncore_cpu_init(void) +{ + uncore_msr_uncores = nhm_msr_uncores; +} + +/* end of Nehalem uncore support */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 316f53a7f942..696d106091db 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,8 +33,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snb.o \ - perf_event_intel_uncore_snbep.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snbep.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c deleted file mode 
100644 index e0e41f55a20c..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ /dev/null @@ -1,717 +0,0 @@ -/* Nehalem/SandBridge/Haswell uncore support */ -#include "../../events/intel/uncore.h" - -/* Uncore IMC PCI IDs */ -#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 -#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 -#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 -#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 -#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 -#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f - -/* SNB event control */ -#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff -#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 -#define SNB_UNC_CTL_EDGE_DET (1 << 18) -#define SNB_UNC_CTL_EN (1 << 22) -#define SNB_UNC_CTL_INVERT (1 << 23) -#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 -#define NHM_UNC_CTL_CMASK_MASK 0xff000000 -#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) - -#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - SNB_UNC_CTL_CMASK_MASK) - -#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - NHM_UNC_CTL_CMASK_MASK) - -/* SNB global control register */ -#define SNB_UNC_PERF_GLOBAL_CTL 0x391 -#define SNB_UNC_FIXED_CTR_CTRL 0x394 -#define SNB_UNC_FIXED_CTR 0x395 - -/* SNB uncore global control */ -#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) -#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) - -/* SNB Cbo register */ -#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 -#define SNB_UNC_CBO_0_PER_CTR0 0x706 -#define SNB_UNC_CBO_MSR_OFFSET 0x10 - -/* SNB ARB register */ -#define SNB_UNC_ARB_PER_CTR0 0x3b0 -#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 -#define SNB_UNC_ARB_MSR_OFFSET 0x10 - -/* NHM global control register */ -#define NHM_UNC_PERF_GLOBAL_CTL 0x391 -#define NHM_UNC_FIXED_CTR 0x394 -#define NHM_UNC_FIXED_CTR_CTRL 0x395 - -/* NHM uncore global control */ -#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) -#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) - -/* NHM uncore register */ -#define NHM_UNC_PERFEVTSEL0 0x3c0 -#define NHM_UNC_UNCORE_PMC0 0x3b0 - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); - -/* Sandy Bridge uncore support */ -static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); -} - -static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void snb_uncore_msr_init_box(struct intel_uncore_box *box) -{ - if (box->pmu->pmu_idx == 0) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, - SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); - } -} - -static struct uncore_event_desc snb_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - { /* end: all zeroes */ }, -}; - -static struct attribute *snb_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask5.attr, - 
NULL, -}; - -static struct attribute_group snb_uncore_format_group = { - .name = "format", - .attrs = snb_uncore_formats_attr, -}; - -static struct intel_uncore_ops snb_uncore_msr_ops = { - .init_box = snb_uncore_msr_init_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = snb_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct event_constraint snb_uncore_arb_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x80, 0x1), - UNCORE_EVENT_CONSTRAINT(0x83, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snb_uncore_cbox = { - .name = "cbox", - .num_counters = 2, - .num_boxes = 4, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, - .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, - .fixed_ctr = SNB_UNC_FIXED_CTR, - .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, - .single_fixed = 1, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, - .event_descs = snb_uncore_events, -}; - -static struct intel_uncore_type snb_uncore_arb = { - .name = "arb", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .perf_ctr = SNB_UNC_ARB_PER_CTR0, - .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_ARB_MSR_OFFSET, - .constraints = snb_uncore_arb_constraints, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, -}; - -static struct intel_uncore_type *snb_msr_uncores[] = { - &snb_uncore_cbox, - &snb_uncore_arb, - NULL, -}; - -void snb_uncore_cpu_init(void) -{ - uncore_msr_uncores = snb_msr_uncores; - if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; -} - -enum { - SNB_PCI_UNCORE_IMC, -}; - -static struct uncore_event_desc snb_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), - INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), - - INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), - INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), - - { /* end: all zeroes */ }, -}; - -#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff -#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 - -/* page size multiple covering all config regs */ -#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 - -#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 -#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 -#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE - -static struct attribute *snb_uncore_imc_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group snb_uncore_imc_format_group = { - .name = "format", - .attrs = snb_uncore_imc_formats_attr, -}; - -static void snb_uncore_imc_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; - resource_size_t addr; - u32 pci_dword; - - pci_read_config_dword(pdev, where, &pci_dword); - addr = pci_dword; - -#ifdef CONFIG_PHYS_ADDR_T_64BIT - pci_read_config_dword(pdev, where + 4, &pci_dword); - addr |= ((resource_size_t)pci_dword << 32); -#endif - - addr &= ~(PAGE_SIZE - 1); - - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); - box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; -} - -static void 
snb_uncore_imc_enable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); -} - -/* - * custom event_init() function because we define our own fixed, free - * running counters, so we do not want to conflict with generic uncore - * logic. Also simplifies processing - */ -static int snb_uncore_imc_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; - int idx, base; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - - /* check only supported bits are set */ - if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) - return -EINVAL; - - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - /* - * check event is known (whitelist, determines counter) - */ - switch (cfg) { - case SNB_UNCORE_PCI_IMC_DATA_READS: - base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; - idx = UNCORE_PMC_IDX_FIXED; - break; - case SNB_UNCORE_PCI_IMC_DATA_WRITES: - base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; - idx = UNCORE_PMC_IDX_FIXED + 1; - break; - default: - return -EINVAL; - } - - /* must be done before validate_group */ - event->hw.event_base = base; - event->hw.config = cfg; - event->hw.idx = idx; - - /* no group validation needed, we have free running counters */ - - return 0; -} - -static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - return 0; -} - -static void snb_uncore_imc_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - u64 count; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - box->n_active++; - - list_add_tail(&event->active_entry, &box->active_list); - - count = snb_uncore_imc_read_counter(box, event); - local64_set(&event->hw.prev_count, count); - - if (box->n_active == 1) - uncore_pmu_start_hrtimer(box); -} - -static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!(hwc->state & PERF_HES_STOPPED)) { - box->n_active--; - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - 
hwc->state |= PERF_HES_STOPPED; - - list_del(&event->active_entry); - - if (box->n_active == 0) - uncore_pmu_cancel_hrtimer(box); - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int snb_uncore_imc_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!box) - return -ENODEV; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - snb_uncore_imc_event_start(event, 0); - - box->n_events++; - - return 0; -} - -static void snb_uncore_imc_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - --box->n_events; - break; - } - } -} - -int snb_pci2phy_map_init(int devid) -{ - struct pci_dev *dev = NULL; - struct pci2phy_map *map; - int bus, segment; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); - if (!dev) - return -ENOTTY; - - bus = dev->bus->number; - segment = pci_domain_nr(dev->bus); - - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - pci_dev_put(dev); - return -ENOMEM; - } - map->pbus_to_physid[bus] = 0; - raw_spin_unlock(&pci2phy_map_lock); - - pci_dev_put(dev); - - return 0; -} - -static struct pmu snb_uncore_imc_pmu = { - .task_ctx_nr = perf_invalid_context, - .event_init = snb_uncore_imc_event_init, - .add = snb_uncore_imc_event_add, - .del = snb_uncore_imc_event_del, - .start = snb_uncore_imc_event_start, - .stop = snb_uncore_imc_event_stop, - .read = uncore_pmu_event_read, -}; - -static struct intel_uncore_ops snb_uncore_imc_ops = { - .init_box = snb_uncore_imc_init_box, - .enable_box = snb_uncore_imc_enable_box, - .disable_box = snb_uncore_imc_disable_box, - .disable_event = snb_uncore_imc_disable_event, - .enable_event = snb_uncore_imc_enable_event, - .hw_config = snb_uncore_imc_hw_config, - .read_counter = snb_uncore_imc_read_counter, -}; - -static struct intel_uncore_type snb_uncore_imc = { - .name = "imc", - .num_counters = 2, - .num_boxes = 1, - .fixed_ctr_bits = 32, - .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, - .event_descs = snb_uncore_imc_events, - .format_group = &snb_uncore_imc_format_group, - .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, - .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, - .ops = &snb_uncore_imc_ops, - .pmu = &snb_uncore_imc_pmu, -}; - -static struct intel_uncore_type *snb_pci_uncores[] = { - [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, - NULL, -}; - -static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id 
hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static struct pci_driver snb_uncore_pci_driver = { - .name = "snb_uncore", - .id_table = snb_uncore_pci_ids, -}; - -static struct pci_driver ivb_uncore_pci_driver = { - .name = "ivb_uncore", - .id_table = ivb_uncore_pci_ids, -}; - -static struct pci_driver hsw_uncore_pci_driver = { - .name = "hsw_uncore", - .id_table = hsw_uncore_pci_ids, -}; - -static struct pci_driver bdw_uncore_pci_driver = { - .name = "bdw_uncore", - .id_table = bdw_uncore_pci_ids, -}; - -static struct pci_driver skl_uncore_pci_driver = { - .name = "skl_uncore", - .id_table = skl_uncore_pci_ids, -}; - -struct imc_uncore_pci_dev { - __u32 pci_id; - struct pci_driver *driver; -}; -#define IMC_DEV(a, d) \ - { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) } - -static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { - IMC_DEV(SNB_IMC, &snb_uncore_pci_driver), - IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ - IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ - IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ - IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ - IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ - IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ - { /* end marker */ } -}; - - -#define for_each_imc_pci_id(x, t) \ - for (x = (t); (x)->pci_id; x++) - -static struct pci_driver *imc_uncore_find_dev(void) -{ - const struct imc_uncore_pci_dev *p; - int ret; - - for_each_imc_pci_id(p, desktop_imc_pci_ids) { - ret = snb_pci2phy_map_init(p->pci_id); - if (ret == 0) - return p->driver; - } - return NULL; -} - -static int imc_uncore_pci_init(void) -{ - struct pci_driver *imc_drv = imc_uncore_find_dev(); - - if (!imc_drv) - return -ENODEV; - - uncore_pci_uncores = snb_pci_uncores; - uncore_pci_driver = imc_drv; - - return 0; -} - -int snb_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int ivb_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} -int hsw_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int bdw_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int skl_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -/* end of Sandy Bridge uncore support */ - -/* Nehalem uncore support */ -static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); -} - -static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); -} - -static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = 
&event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); -} - -static struct attribute *nhm_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask8.attr, - NULL, -}; - -static struct attribute_group nhm_uncore_format_group = { - .name = "format", - .attrs = nhm_uncore_formats_attr, -}; - -static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhm_uncore_msr_ops = { - .disable_box = nhm_uncore_msr_disable_box, - .enable_box = nhm_uncore_msr_enable_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = nhm_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct intel_uncore_type nhm_uncore = { - .name = "", - .num_counters = 8, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .event_ctl = NHM_UNC_PERFEVTSEL0, - .perf_ctr = NHM_UNC_UNCORE_PMC0, - .fixed_ctr = NHM_UNC_FIXED_CTR, - .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, - .event_mask = NHM_UNC_RAW_EVENT_MASK, - .event_descs = nhm_uncore_events, - .ops = &nhm_uncore_msr_ops, - .format_group = &nhm_uncore_format_group, -}; - -static struct intel_uncore_type *nhm_msr_uncores[] = { - &nhm_uncore, - NULL, -}; - -void nhm_uncore_cpu_init(void) -{ - uncore_msr_uncores = nhm_msr_uncores; -} - -/* end of Nehalem uncore support */ -- cgit From ed367e6ca42716a11a6d1b5162fdd378f9494eff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:18 +0100 Subject: perf/x86: Move perf_event_intel_uncore_snbep.c => x86/events/intel/uncore_snbep.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/uncore_snbep.c | 3125 ++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 1 - .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 3125 -------------------- 4 files changed, 3126 insertions(+), 3127 deletions(-) create mode 100644 arch/x86/events/intel/uncore_snbep.c delete mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index eb0d92198a56..8c939ca26f6d 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -9,4 +9,4 @@ obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o 
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c new file mode 100644 index 000000000000..0c801f77c03d --- /dev/null +++ b/arch/x86/events/intel/uncore_snbep.c @@ -0,0 +1,3125 @@ +/* SandyBridge-EP/IvyTown uncore support */ +#include "uncore.h" + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define 
SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CBO_MSR_OFFSET 0x20 + +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 + +#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ + .event = (e), \ + .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ + .config_mask = (m), \ + .idx = (i) \ +} + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* IVBEP event control */ +#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS) +#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_TRESH_MASK) +/* IVBEP Ubox */ +#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31) +#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) + +#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +/* IVBEP Cbo */ +#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + +/* IVBEP home agent */ +#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) +#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \ + (IVBEP_PMON_RAW_EVENT_MASK | \ + IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST) +/* IVBEP PCU */ +#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* IVBEP QPI */ +#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (IVBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +/* Haswell-EP Ubox */ +#define HSWEP_U_MSR_PMON_CTR0 0x709 +#define HSWEP_U_MSR_PMON_CTL0 0x705 +#define HSWEP_U_MSR_PMON_FILTER 0x707 + +#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 +#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704 + +#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0) +#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1) +#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \ + (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \ + HSWEP_U_MSR_PMON_BOX_FILTER_CID) + +/* Haswell-EP CBo */ +#define HSWEP_C0_MSR_PMON_CTR0 0xe08 +#define HSWEP_C0_MSR_PMON_CTL0 0xe01 +#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00 +#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05 +#define HSWEP_CBO_MSR_OFFSET 0x10 + + +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6) +#define 
HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + + +/* Haswell-EP Sbox */ +#define HSWEP_S0_MSR_PMON_CTR0 0x726 +#define HSWEP_S0_MSR_PMON_CTL0 0x721 +#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720 +#define HSWEP_SBOX_MSR_OFFSET 0xa +#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* Haswell-EP PCU */ +#define HSWEP_PCU_MSR_PMON_CTR0 0x717 +#define HSWEP_PCU_MSR_PMON_CTL0 0x711 +#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 +#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 + +/* KNL Ubox */ +#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) +/* KNL CHA */ +#define KNL_CHA_MSR_OFFSET 0xc +#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) +#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff +#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) +#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) + +/* KNL EDC/MC UCLK */ +#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 +#define KNL_UCLK_MSR_PMON_CTL0 0x420 +#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 +#define KNL_PMON_FIXED_CTL_EN 0x1 + +/* KNL EDC */ +#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 +#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 +#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 + +/* KNL MC */ +#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 +#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 +#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 +#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c +#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 + +/* KNL IRP */ +#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 +#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +/* KNL PCU */ +#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f +#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) +#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 +#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ + (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ + KNL_PCU_PMON_CTL_USE_OCC_CTR | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_CBO_PMON_CTL_TID_EN | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); 
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); +DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); +DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); +DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, 
config); + } +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + 
&format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ + .init_box = snbep_uncore_msr_init_box \ + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + 
.enable_event = snbep_uncore_pci_enable_event, \ +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct extra_reg snbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 
0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), + EVENT_EXTRA_END +}; + +static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i; + + if (uncore_box_is_fake(box)) + return; + + for (i = 0; i < 5; i++) { + if (reg1->alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + reg1->alloc = 0; +} + +static struct event_constraint * +__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, + u64 (*cbox_filter_mask)(int fields)) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i, alloc = 0; + unsigned long flags; + u64 mask; + + if (reg1->idx == EXTRA_REG_NONE) + return NULL; + + raw_spin_lock_irqsave(&er->lock, flags); + for (i = 0; i < 5; i++) { + if (!(reg1->idx & (0x1 << i))) + continue; + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + continue; + + mask = cbox_filter_mask(0x1 << i); + if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || + !((reg1->config ^ er->config) & mask)) { + atomic_add(1 << (i * 6), &er->ref); + er->config &= ~mask; + er->config |= reg1->config & mask; + alloc |= (0x1 << i); + } else { + break; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + if (i < 5) + goto fail; + + if (!uncore_box_is_fake(box)) + reg1->alloc |= alloc; + + return NULL; +fail: + for (; i >= 0; i--) { + if (alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + return &uncore_constraint_empty; +} + +static u64 snbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x4) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); +} + +static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * 
box->pmu->pmu_idx; + reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_cbox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_cbox_hw_config, + .get_constraint = snbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_cbox_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 config = reg1->config; + + if (new_idx > reg1->idx) + config <<= 8 * (new_idx - reg1->idx); + else + config >>= 8 * (reg1->idx - new_idx); + + if (modify) { + hwc->config += new_idx - reg1->idx; + reg1->config = config; + reg1->idx = new_idx; + } + return config; +} + +static struct event_constraint * +snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + unsigned long flags; + int idx = reg1->idx; + u64 mask, config1 = reg1->config; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; +again: + mask = 0xffULL << (idx * 8); + raw_spin_lock_irqsave(&er->lock, flags); + if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || + !((config1 ^ er->config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + idx = (idx + 1) % 4; + if (idx != reg1->idx) { + config1 = snbep_pcu_alter_er(event, idx, false); + goto again; + } + return &uncore_constraint_empty; + } + + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx) + snbep_pcu_alter_er(event, idx, true); + reg1->alloc = 1; + } + return NULL; +} + +static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + atomic_sub(1 << (reg1->idx * 8), &er->ref); + reg1->alloc = 0; +} + +static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8)); + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + 
.perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +void snbep_uncore_cpu_init(void) +{ + if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = snbep_msr_uncores; +} + +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, + HSWEP_PCI_PCU_3, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = 
snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + SNBEP_PCI_UNCORE_HA, + SNBEP_PCI_UNCORE_IMC, + SNBEP_PCI_UNCORE_QPI, + SNBEP_PCI_UNCORE_R2PCIE, + SNBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, + [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, + [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, + [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, + [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id snbep_uncore_pci_ids[] = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static int snbep_pci2phy_map_init(int devid) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid, segment; + struct pci2phy_map *map; + int err = 0; + u32 config = 0; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + err = 
pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; + nodeid = config; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + map->pbus_to_physid[bus] = i; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + } + + if (!err) { + /* + * For PCI bus with no UBOX device, find the next bus + * that has UBOX device and use its mapping. + */ + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } + } + raw_spin_unlock(&pci2phy_map_lock); + } + + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; +} + +int snbep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x3ce0); + if (ret) + return ret; + uncore_pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + return 0; +} +/* end of Sandy Bridge-EP uncore support */ + +/* IvyTown uncore support */ +static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); +} + +static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = ivbep_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops ivbep_uncore_msr_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +static struct intel_uncore_ops ivbep_uncore_pci_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +#define IVBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &ivbep_uncore_pci_ops, \ + .format_group = &ivbep_uncore_format_group + +static struct attribute *ivbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_link.attr, 
+ &format_attr_filter_state2.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct attribute_group ivbep_uncore_format_group = { + .name = "format", + .attrs = ivbep_uncore_formats_attr, +}; + +static struct attribute_group ivbep_uncore_ubox_format_group = { + .name = "format", + .attrs = ivbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group ivbep_uncore_cbox_format_group = { + .name = "format", + .attrs = ivbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group ivbep_uncore_pcu_format_group = { + .name = "format", + .attrs = ivbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group ivbep_uncore_qpi_format_group = { + .name = "format", + .attrs = ivbep_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_type ivbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct extra_reg ivbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + 
SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + EVENT_EXTRA_END +}; + +static u64 ivbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) { + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC; + } + + return mask; +} + +static struct event_constraint * +ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask); +} + +static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 6, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops ivbep_uncore_cbox_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = ivbep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = ivbep_cbox_hw_config, + .get_constraint = ivbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 15, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &ivbep_uncore_cbox_ops, + .format_group 
= &ivbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_ops ivbep_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_pcu_ops, + .format_group = &ivbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *ivbep_msr_uncores[] = { + &ivbep_uncore_ubox, + &ivbep_uncore_cbox, + &ivbep_uncore_pcu, + NULL, +}; + +void ivbep_uncore_cpu_init(void) +{ + if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = ivbep_msr_uncores; +} + +static struct intel_uncore_type ivbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +/* registers in IRP boxes are not properly aligned */ +static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], + hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops ivbep_uncore_irp_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivbep_uncore_irp_disable_event, + .enable_event = ivbep_uncore_irp_enable_event, + .read_counter = ivbep_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivbep_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = IVBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_irp_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct intel_uncore_ops ivbep_uncore_qpi_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = 
snbep_uncore_pci_disable_event, + .enable_event = snbep_qpi_enable_event, + .read_counter = snbep_uncore_pci_read_counter, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_qpi_ops, + .format_group = &ivbep_uncore_qpi_format_group, +}; + +static struct intel_uncore_type ivbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + IVBEP_PCI_UNCORE_HA, + IVBEP_PCI_UNCORE_IMC, + IVBEP_PCI_UNCORE_IRP, + IVBEP_PCI_UNCORE_QPI, + IVBEP_PCI_UNCORE_R2PCIE, + IVBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *ivbep_pci_uncores[] = { + [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha, + [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc, + [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp, + [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi, + [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie, + [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id ivbep_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2), + }, + { /* 
R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver ivbep_uncore_pci_driver = { + .name = "ivbep_uncore", + .id_table = ivbep_uncore_pci_ids, +}; + +int ivbep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x0e1e); + if (ret) + return ret; + uncore_pci_uncores = ivbep_pci_uncores; + uncore_pci_driver = &ivbep_uncore_pci_driver; + return 0; +} +/* end of IvyTown uncore support */ + +/* KNL uncore support */ +static struct attribute *knl_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute_group knl_uncore_ubox_format_group = { + .name = "format", + .attrs = knl_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_ubox_format_group, +}; + +static struct attribute *knl_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid4.attr, + &format_attr_filter_link3.attr, + &format_attr_filter_state4.attr, + &format_attr_filter_local.attr, + &format_attr_filter_all_op.attr, + &format_attr_filter_nnm.attr, + &format_attr_filter_opc3.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group knl_uncore_cha_format_group = { + .name = "format", + .attrs = knl_uncore_cha_formats_attr, +}; + +static struct event_constraint knl_uncore_cha_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg knl_uncore_cha_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), + EVENT_EXTRA_END +}; + +static u64 knl_cha_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x4) + mask |= 
KNL_CHA_MSR_PMON_BOX_FILTER_OP; + return mask; +} + +static struct event_constraint * +knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); +} + +static int knl_cha_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = knl_uncore_cha_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event); + +static struct intel_uncore_ops knl_uncore_cha_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = knl_cha_hw_config, + .get_constraint = knl_cha_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type knl_uncore_cha = { + .name = "cha", + .num_counters = 4, + .num_boxes = 38, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = KNL_CHA_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = knl_uncore_cha_constraints, + .ops = &knl_uncore_cha_ops, + .format_group = &knl_uncore_cha_format_group, +}; + +static struct attribute *knl_uncore_pcu_formats_attr[] = { + &format_attr_event2.attr, + &format_attr_use_occ_ctr.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh6.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + NULL, +}; + +static struct attribute_group knl_uncore_pcu_format_group = { + .name = "format", + .attrs = knl_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *knl_msr_uncores[] = { + &knl_uncore_ubox, + &knl_uncore_cha, + &knl_uncore_pcu, + NULL, +}; + +void knl_uncore_cpu_init(void) +{ + uncore_msr_uncores = knl_msr_uncores; +} + +static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) + == UNCORE_FIXED_EVENT) + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | KNL_PMON_FIXED_CTL_EN); + else + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | SNBEP_PMON_CTL_EN); +} + 
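/*
 * Editor's note -- illustrative sketch, not part of the patch: the
 * .perf_ctr_bits = 44/48 values that appear in every intel_uncore_type
 * definition in this file matter because the hardware counters are only
 * that wide, so whoever accumulates them has to compute deltas modulo
 * 2^width.  perf's generic uncore code does its own accumulation; the
 * hypothetical, self-contained helper below only demonstrates that
 * wrap-around arithmetic and builds as an ordinary userspace program.
 */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper, not kernel code: delta of a 'width'-bit counter. */
static uint64_t counter_delta(uint64_t prev, uint64_t cur, unsigned int width)
{
	uint64_t mask = (width >= 64) ? ~0ULL : ((1ULL << width) - 1);

	/* Subtraction modulo 2^width handles a single wrap correctly. */
	return (cur - prev) & mask;
}

int main(void)
{
	/* A 48-bit counter read just before and just after it wrapped. */
	uint64_t prev = 0xffffffffff00ULL;	/* near the 48-bit limit */
	uint64_t cur  = 0x80ULL;		/* wrapped and kept counting */

	/* Prints 384 (0x180): 0x100 ticks to the wrap plus 0x80 after it. */
	printf("delta = %llu\n",
	       (unsigned long long)counter_delta(prev, cur, 48));
	return 0;
}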
+static struct intel_uncore_ops knl_uncore_imc_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = knl_uncore_imc_enable_box, + .read_counter = snbep_uncore_pci_read_counter, + .enable_event = knl_uncore_imc_enable_event, + .disable_event = snbep_uncore_pci_disable_event, +}; + +static struct intel_uncore_type knl_uncore_imc_uclk = { + .name = "imc_uclk", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_imc_dclk = { + .name = "imc", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW, + .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL, + .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_uclk = { + .name = "edc_uclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_eclk = { + .name = "edc_eclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW, + .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL, + .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct event_constraint knl_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type knl_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = knl_uncore_m2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct attribute *knl_uncore_irp_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group knl_uncore_irp_format_group = { + .name = "format", + .attrs = knl_uncore_irp_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, + .format_group = &knl_uncore_irp_format_group, +}; + +enum { + KNL_PCI_UNCORE_MC_UCLK, + 
KNL_PCI_UNCORE_MC_DCLK, + KNL_PCI_UNCORE_EDC_UCLK, + KNL_PCI_UNCORE_EDC_ECLK, + KNL_PCI_UNCORE_M2PCIE, + KNL_PCI_UNCORE_IRP, +}; + +static struct intel_uncore_type *knl_pci_uncores[] = { + [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk, + [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk, + [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk, + [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk, + [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie, + [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp, + NULL, +}; + +/* + * KNL uses a common PCI device ID for multiple instances of an Uncore PMU + * device type. prior to KNL, each instance of a PMU device type had a unique + * device ID. + * + * PCI Device ID Uncore PMU Devices + * ---------------------------------- + * 0x7841 MC0 UClk, MC1 UClk + * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2, + * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2 + * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk, + * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk + * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk, + * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk + * 0x7817 M2PCIe + * 0x7814 IRP +*/ + +static const struct pci_device_id knl_uncore_pci_ids[] = { + { /* MC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), + }, + { /* MC DClk Channel */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), + }, + { /* EDC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), + }, + { /* EDC EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), + }, + { /* M2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver knl_uncore_pci_driver = { + .name = "knl_uncore", + .id_table = knl_uncore_pci_ids, +}; + +int knl_uncore_pci_init(void) +{ + int ret; + + /* All KNL PCI based PMON units are on the same PCI bus except IRP */ + ret = snb_pci2phy_map_init(0x7814); /* IRP */ + if (ret) + return ret; + ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */ + if (ret) + return ret; + uncore_pci_uncores = knl_pci_uncores; + uncore_pci_driver = &knl_uncore_pci_driver; + return 0; +} + +/* end of KNL uncore support */ + +/* Haswell-EP uncore support */ +static struct attribute *hswep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_filter_tid2.attr, + &format_attr_filter_cid.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_ubox_format_group = { + .name = "format", + .attrs = hswep_uncore_ubox_formats_attr, +}; + +static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + reg1->reg = HSWEP_U_MSR_PMON_FILTER; + reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK; + reg1->idx = 0; + return 0; +} + +static struct intel_uncore_ops hswep_uncore_ubox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_ubox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_ubox = { + 
.name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &hswep_uncore_ubox_ops, + .format_group = &hswep_uncore_ubox_format_group, +}; + +static struct attribute *hswep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid3.attr, + &format_attr_filter_link2.attr, + &format_attr_filter_state3.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_cbox_format_group = { + .name = "format", + .attrs = hswep_uncore_cbox_formats_attr, +}; + +static struct event_constraint hswep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x09, 0x1), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg hswep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + EVENT_EXTRA_END +}; + +static u64 hswep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + if (fields & 0x1) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + 
mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) { + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC; + } + return mask; +} + +static struct event_constraint * +hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask); +} + +static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 1, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops hswep_uncore_cbox_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = hswep_cbox_hw_config, + .get_constraint = hswep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 18, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = hswep_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +/* + * Write SBOX Initialization register bit by bit to avoid spurious #GPs + */ +static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) { + u64 init = SNBEP_PMON_BOX_CTL_INT; + u64 flags = 0; + int i; + + for_each_set_bit(i, (unsigned long *)&init, 64) { + flags |= (1ULL << i); + wrmsrl(msr, flags); + } + } +} + +static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .init_box = hswep_uncore_sbox_msr_init_box +}; + +static struct attribute *hswep_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_sbox_format_group = { + .name = "format", + .attrs = hswep_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_type hswep_uncore_sbox = { + .name = "sbox", + 
.num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 44, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << reg1->idx); + } + return 0; +} + +static struct intel_uncore_ops hswep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &hswep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *hswep_msr_uncores[] = { + &hswep_uncore_ubox, + &hswep_uncore_cbox, + &hswep_uncore_sbox, + &hswep_uncore_pcu, + NULL, +}; + +void hswep_uncore_cpu_init(void) +{ + if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + + /* Detect 6-8 core systems with only two SBOXes */ + if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + u32 capid4; + + pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + hswep_uncore_sbox.num_boxes = 2; + } + + uncore_msr_uncores = hswep_msr_uncores; +} + +static struct intel_uncore_type hswep_uncore_ha = { + .name = "ha", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct uncore_event_desc hswep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type hswep_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; + +static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, 
hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops hswep_uncore_irp_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivbep_uncore_irp_disable_event, + .enable_event = ivbep_uncore_irp_enable_event, + .read_counter = hswep_uncore_irp_read_counter, +}; + +static struct intel_uncore_type hswep_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type hswep_uncore_qpi = { + .name = "qpi", + .num_counters = 5, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; + +static struct event_constraint hswep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x24, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x27, 0x1), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type hswep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = hswep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct event_constraint hswep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type hswep_uncore_r3qpi = { + .name 
= "r3qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 44, + .constraints = hswep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + HSWEP_PCI_UNCORE_HA, + HSWEP_PCI_UNCORE_IMC, + HSWEP_PCI_UNCORE_IRP, + HSWEP_PCI_UNCORE_QPI, + HSWEP_PCI_UNCORE_R2PCIE, + HSWEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *hswep_pci_uncores[] = { + [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha, + [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc, + [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp, + [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi, + [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie, + [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id hswep_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* PCU.3 (for Capability registers) */ + 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver hswep_uncore_pci_driver = { + .name = "hswep_uncore", + .id_table = hswep_uncore_pci_ids, +}; + +int hswep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x2f1e); + if (ret) + return ret; + uncore_pci_uncores = hswep_pci_uncores; + uncore_pci_driver = &hswep_uncore_pci_driver; + return 0; +} +/* end of Haswell-EP uncore support */ + +/* BDX uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 24, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = bdx_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type bdx_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { + &bdx_uncore_ubox, + &bdx_uncore_cbox, + &bdx_uncore_sbox, + &hswep_uncore_pcu, + NULL, +}; + +void bdx_uncore_cpu_init(void) +{ + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = bdx_msr_uncores; +} + +static struct intel_uncore_type bdx_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type bdx_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + 
.event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct event_constraint bdx_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x7), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 3, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + BDX_PCI_UNCORE_HA, + BDX_PCI_UNCORE_IMC, + BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_QPI, + BDX_PCI_UNCORE_R2PCIE, + BDX_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { + [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, + [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, + [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, + [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id bdx_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), + 
}, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), + }, + { /* QPI Port 2 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { + .name = "bdx_uncore", + .id_table = bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x6f1e); + + if (ret) + return ret; + uncore_pci_uncores = bdx_pci_uncores; + uncore_pci_driver = &bdx_uncore_pci_driver; + return 0; +} + +/* end of BDX uncore support */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 696d106091db..973b77bb9ceb 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snbep.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c deleted file mode 100644 index 188e18a1f224..000000000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ /dev/null @@ -1,3125 +0,0 @@ -/* SandyBridge-EP/IvyTown uncore support */ -#include "../../events/intel/uncore.h" - -/* SNB-EP Box level control */ -#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) -#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) -#define 
SNBEP_PMON_BOX_CTL_FRZ (1 << 8) -#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) -#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS | \ - SNBEP_PMON_BOX_CTL_FRZ_EN) -/* SNB-EP event control */ -#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff -#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 -#define SNBEP_PMON_CTL_RST (1 << 17) -#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) -#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) -#define SNBEP_PMON_CTL_EN (1 << 22) -#define SNBEP_PMON_CTL_INVERT (1 << 23) -#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 -#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PMON_CTL_TRESH_MASK) - -/* SNB-EP Ubox event control */ -#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) - -#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) -#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* SNB-EP PCU event control */ -#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 -#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) -#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) -#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -/* SNB-EP pci control register */ -#define SNBEP_PCI_PMON_BOX_CTL 0xf4 -#define SNBEP_PCI_PMON_CTL0 0xd8 -/* SNB-EP pci counter register */ -#define SNBEP_PCI_PMON_CTR0 0xa0 - -/* SNB-EP home agent register */ -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 -#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 -/* SNB-EP memory controller register */ -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 -/* SNB-EP QPI register */ -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c - -/* SNB-EP Ubox register */ -#define SNBEP_U_MSR_PMON_CTR0 0xc16 -#define SNBEP_U_MSR_PMON_CTL0 0xc10 - -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 - -/* SNB-EP Cbo register */ -#define SNBEP_C0_MSR_PMON_CTR0 0xd16 -#define SNBEP_C0_MSR_PMON_CTL0 0xd10 -#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 -#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 -#define SNBEP_CBO_MSR_OFFSET 0x20 - -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 - -#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ - .event = (e), \ - .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ - .config_mask = (m), \ - .idx = (i) \ -} - -/* SNB-EP PCU register */ -#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 -#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 -#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 -#define 
SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff -#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc -#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd - -/* IVBEP event control */ -#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS) -#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_TRESH_MASK) -/* IVBEP Ubox */ -#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31) -#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) - -#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) -/* IVBEP Cbo */ -#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) - -/* IVBEP home agent */ -#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) -#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \ - (IVBEP_PMON_RAW_EVENT_MASK | \ - IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST) -/* IVBEP PCU */ -#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) -/* IVBEP QPI */ -#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (IVBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -/* Haswell-EP Ubox */ -#define HSWEP_U_MSR_PMON_CTR0 0x709 -#define HSWEP_U_MSR_PMON_CTL0 0x705 -#define HSWEP_U_MSR_PMON_FILTER 0x707 - -#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 -#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704 - -#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0) -#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1) -#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \ - (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \ - HSWEP_U_MSR_PMON_BOX_FILTER_CID) - -/* Haswell-EP CBo */ -#define HSWEP_C0_MSR_PMON_CTR0 0xe08 -#define HSWEP_C0_MSR_PMON_CTL0 0xe01 -#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00 -#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05 -#define HSWEP_CBO_MSR_OFFSET 0x10 - - -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) - - -/* Haswell-EP Sbox */ -#define HSWEP_S0_MSR_PMON_CTR0 0x726 -#define HSWEP_S0_MSR_PMON_CTL0 0x721 -#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720 -#define HSWEP_SBOX_MSR_OFFSET 0xa -#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* 
Haswell-EP PCU */ -#define HSWEP_PCU_MSR_PMON_CTR0 0x717 -#define HSWEP_PCU_MSR_PMON_CTL0 0x711 -#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 -#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 - -/* KNL Ubox */ -#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) -/* KNL CHA */ -#define KNL_CHA_MSR_OFFSET 0xc -#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) -#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ - KNL_CHA_MSR_PMON_CTL_QOR) -#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff -#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) -#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) - -/* KNL EDC/MC UCLK */ -#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 -#define KNL_UCLK_MSR_PMON_CTL0 0x420 -#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 -#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c -#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 -#define KNL_PMON_FIXED_CTL_EN 0x1 - -/* KNL EDC */ -#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 -#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 -#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 -#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c -#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 - -/* KNL MC */ -#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 -#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 -#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 -#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c -#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 - -/* KNL IRP */ -#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 -#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - KNL_CHA_MSR_PMON_CTL_QOR) -/* KNL PCU */ -#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f -#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) -#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 -#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ - (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ - KNL_PCU_PMON_CTL_USE_OCC_CTR | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_CBO_PMON_CTL_TID_EN | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); -DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); -DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); -DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); -DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); 
-DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); -DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); -DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); -DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); -DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); -DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); - -static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | 
SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config); -} - -static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); - pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); - - return count; -} - -static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - - pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); -} - -static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) - wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); -} - -static struct attribute *snbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *snbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *snbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_nid.attr, - &format_attr_filter_state.attr, - &format_attr_filter_opc.attr, - NULL, -}; - -static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *snbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - 
&format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc snbep_uncore_qpi_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), - INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), - { /* end: all zeroes */ }, -}; - -static struct attribute_group snbep_uncore_format_group = { - .name = "format", - .attrs = snbep_uncore_formats_attr, -}; - -static struct attribute_group snbep_uncore_ubox_format_group = { - .name = "format", - .attrs = snbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group snbep_uncore_cbox_format_group = { - .name = "format", - .attrs = snbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group snbep_uncore_pcu_format_group = { - .name = "format", - .attrs = snbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group snbep_uncore_qpi_format_group = { - .name = "format", - .attrs = snbep_uncore_qpi_formats_attr, -}; - -#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ - .init_box = snbep_uncore_msr_init_box \ - -static struct intel_uncore_ops snbep_uncore_msr_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_pci_init_box, \ - .disable_box = snbep_uncore_pci_disable_box, \ - .enable_box = snbep_uncore_pci_enable_box, \ - .disable_event = snbep_uncore_pci_disable_event, \ - .read_counter = snbep_uncore_pci_read_counter - -static struct intel_uncore_ops snbep_uncore_pci_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_uncore_pci_enable_event, \ -}; - -static struct event_constraint snbep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x02, 0x3), - UNCORE_EVENT_CONSTRAINT(0x04, 0x3), - UNCORE_EVENT_CONSTRAINT(0x05, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x3), - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), - 
UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x30, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_ubox_format_group, -}; - -static struct extra_reg snbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), - 
SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), - EVENT_EXTRA_END -}; - -static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i; - - if (uncore_box_is_fake(box)) - return; - - for (i = 0; i < 5; i++) { - if (reg1->alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - reg1->alloc = 0; -} - -static struct event_constraint * -__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, - u64 (*cbox_filter_mask)(int fields)) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i, alloc = 0; - unsigned long flags; - u64 mask; - - if (reg1->idx == EXTRA_REG_NONE) - return NULL; - - raw_spin_lock_irqsave(&er->lock, flags); - for (i = 0; i < 5; i++) { - if (!(reg1->idx & (0x1 << i))) - continue; - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - continue; - - mask = cbox_filter_mask(0x1 << i); - if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || - !((reg1->config ^ er->config) & mask)) { - atomic_add(1 << (i * 6), &er->ref); - er->config &= ~mask; - er->config |= reg1->config & mask; - alloc |= (0x1 << i); - } else { - break; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - if (i < 5) - goto fail; - - if (!uncore_box_is_fake(box)) - reg1->alloc |= alloc; - - return NULL; -fail: - for (; i >= 0; i--) { - if (alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - return &uncore_constraint_empty; -} - -static u64 snbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x4) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - - return mask; -} - -static struct event_constraint * -snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); -} - -static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_cbox_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_cbox_hw_config, - .get_constraint = snbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = 
SNBEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &snbep_uncore_cbox_ops, - .format_group = &snbep_uncore_cbox_format_group, -}; - -static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 config = reg1->config; - - if (new_idx > reg1->idx) - config <<= 8 * (new_idx - reg1->idx); - else - config >>= 8 * (reg1->idx - new_idx); - - if (modify) { - hwc->config += new_idx - reg1->idx; - reg1->config = config; - reg1->idx = new_idx; - } - return config; -} - -static struct event_constraint * -snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - unsigned long flags; - int idx = reg1->idx; - u64 mask, config1 = reg1->config; - bool ok = false; - - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; -again: - mask = 0xffULL << (idx * 8); - raw_spin_lock_irqsave(&er->lock, flags); - if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || - !((config1 ^ er->config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - idx = (idx + 1) % 4; - if (idx != reg1->idx) { - config1 = snbep_pcu_alter_er(event, idx, false); - goto again; - } - return &uncore_constraint_empty; - } - - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx) - snbep_pcu_alter_er(event, idx, true); - reg1->alloc = 1; - } - return NULL; -} - -static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - atomic_sub(1 << (reg1->idx * 8), &er->ref); - reg1->alloc = 0; -} - -static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8)); - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *snbep_msr_uncores[] = { - &snbep_uncore_ubox, - &snbep_uncore_cbox, - &snbep_uncore_pcu, - NULL, -}; - -void snbep_uncore_cpu_init(void) -{ - if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - 
snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = snbep_msr_uncores; -} - -enum { - SNBEP_PCI_QPI_PORT0_FILTER, - SNBEP_PCI_QPI_PORT1_FILTER, - HSWEP_PCI_PCU_3, -}; - -static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { - reg1->idx = 0; - reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; - reg1->config = event->attr.config1; - reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; - reg2->config = event->attr.config2; - } - return 0; -} - -static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; - if (filter_pdev) { - pci_write_config_dword(filter_pdev, reg1->reg, - (u32)reg1->config); - pci_write_config_dword(filter_pdev, reg1->reg + 4, - (u32)(reg1->config >> 32)); - pci_write_config_dword(filter_pdev, reg2->reg, - (u32)reg2->config); - pci_write_config_dword(filter_pdev, reg2->reg + 4, - (u32)(reg2->config >> 32)); - } - } - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops snbep_uncore_qpi_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_qpi_enable_event, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -#define SNBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &snbep_uncore_pci_ops, \ - .format_group = &snbep_uncore_format_group - -static struct intel_uncore_type snbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, -}; - - -static struct intel_uncore_type snbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum 
{ - SNBEP_PCI_UNCORE_HA, - SNBEP_PCI_UNCORE_IMC, - SNBEP_PCI_UNCORE_QPI, - SNBEP_PCI_UNCORE_R2PCIE, - SNBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *snbep_pci_uncores[] = { - [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, - [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, - [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, - [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, - [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id snbep_uncore_pci_ids[] = { - { /* Home Agent */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), - }, - { /* MC Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), - }, - { /* QPI Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver snbep_uncore_pci_driver = { - .name = "snbep_uncore", - .id_table = snbep_uncore_pci_ids, -}; - -/* - * build pci bus to socket mapping - */ -static int snbep_pci2phy_map_init(int devid) -{ - struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; - struct pci2phy_map *map; - int err = 0; - u32 config = 0; - - while (1) { - /* find the UBOX device */ - ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); - if (!ubox_dev) - break; - bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, 0x40, &config); - if (err) - break; - nodeid = config; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, 0x54, &config); - if (err) - break; - - segment = pci_domain_nr(ubox_dev->bus); - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - err = -ENOMEM; - break; - } - - /* - * every three bits in the Node ID mapping register maps - * to a particular node. 
- */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; - break; - } - } - raw_spin_unlock(&pci2phy_map_lock); - } - - if (!err) { - /* - * For PCI bus with no UBOX device, find the next bus - * that has UBOX device and use its mapping. - */ - raw_spin_lock(&pci2phy_map_lock); - list_for_each_entry(map, &pci2phy_map_head, list) { - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; - else - map->pbus_to_physid[bus] = i; - } - } - raw_spin_unlock(&pci2phy_map_lock); - } - - pci_dev_put(ubox_dev); - - return err ? pcibios_err_to_errno(err) : 0; -} - -int snbep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x3ce0); - if (ret) - return ret; - uncore_pci_uncores = snbep_pci_uncores; - uncore_pci_driver = &snbep_uncore_pci_driver; - return 0; -} -/* end of Sandy Bridge-EP uncore support */ - -/* IvyTown uncore support */ -static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - if (msr) - wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); -} - -static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); -} - -#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = ivbep_uncore_msr_init_box, \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops ivbep_uncore_msr_ops = { - IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -static struct intel_uncore_ops ivbep_uncore_pci_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, -}; - -#define IVBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &ivbep_uncore_pci_ops, \ - .format_group = &ivbep_uncore_format_group - -static struct attribute *ivbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_link.attr, - &format_attr_filter_state2.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - 
&format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct attribute_group ivbep_uncore_format_group = { - .name = "format", - .attrs = ivbep_uncore_formats_attr, -}; - -static struct attribute_group ivbep_uncore_ubox_format_group = { - .name = "format", - .attrs = ivbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group ivbep_uncore_cbox_format_group = { - .name = "format", - .attrs = ivbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group ivbep_uncore_pcu_format_group = { - .name = "format", - .attrs = ivbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group ivbep_uncore_qpi_format_group = { - .name = "format", - .attrs = ivbep_uncore_qpi_formats_attr, -}; - -static struct intel_uncore_type ivbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &ivbep_uncore_msr_ops, - .format_group = &ivbep_uncore_ubox_format_group, -}; - -static struct extra_reg ivbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - 
SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - EVENT_EXTRA_END -}; - -static u64 ivbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) { - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC; - } - - return mask; -} - -static struct event_constraint * -ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask); -} - -static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 6, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops ivbep_uncore_cbox_ops = { - .init_box = ivbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = ivbep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = ivbep_cbox_hw_config, - .get_constraint = ivbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 15, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &ivbep_uncore_cbox_ops, - .format_group = &ivbep_uncore_cbox_format_group, -}; - -static struct intel_uncore_ops ivbep_uncore_pcu_ops = { - IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = 
IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_pcu_ops, - .format_group = &ivbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *ivbep_msr_uncores[] = { - &ivbep_uncore_ubox, - &ivbep_uncore_cbox, - &ivbep_uncore_pcu, - NULL, -}; - -void ivbep_uncore_cpu_init(void) -{ - if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = ivbep_msr_uncores; -} - -static struct intel_uncore_type ivbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -/* registers in IRP boxes are not properly aligned */ -static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; -static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; - -static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], - hwc->config | SNBEP_PMON_CTL_EN); -} - -static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config); -} - -static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops ivbep_uncore_irp_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivbep_uncore_irp_disable_event, - .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = ivbep_uncore_irp_read_counter, -}; - -static struct intel_uncore_type ivbep_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = IVBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &ivbep_uncore_irp_ops, - .format_group = &ivbep_uncore_format_group, -}; - -static struct intel_uncore_ops ivbep_uncore_qpi_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_qpi_enable_event, - .read_counter = snbep_uncore_pci_read_counter, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = 
SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_qpi_ops, - .format_group = &ivbep_uncore_qpi_format_group, -}; - -static struct intel_uncore_type ivbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - IVBEP_PCI_UNCORE_HA, - IVBEP_PCI_UNCORE_IMC, - IVBEP_PCI_UNCORE_IRP, - IVBEP_PCI_UNCORE_QPI, - IVBEP_PCI_UNCORE_R2PCIE, - IVBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *ivbep_pci_uncores[] = { - [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha, - [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc, - [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp, - [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi, - [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie, - [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id ivbep_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = 
UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver ivbep_uncore_pci_driver = { - .name = "ivbep_uncore", - .id_table = ivbep_uncore_pci_ids, -}; - -int ivbep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x0e1e); - if (ret) - return ret; - uncore_pci_uncores = ivbep_pci_uncores; - uncore_pci_driver = &ivbep_uncore_pci_driver; - return 0; -} -/* end of IvyTown uncore support */ - -/* KNL uncore support */ -static struct attribute *knl_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute_group knl_uncore_ubox_format_group = { - .name = "format", - .attrs = knl_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &knl_uncore_ubox_format_group, -}; - -static struct attribute *knl_uncore_cha_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_qor.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid4.attr, - &format_attr_filter_link3.attr, - &format_attr_filter_state4.attr, - &format_attr_filter_local.attr, - &format_attr_filter_all_op.attr, - &format_attr_filter_nnm.attr, - &format_attr_filter_opc3.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute_group knl_uncore_cha_format_group = { - .name = "format", - .attrs = knl_uncore_cha_formats_attr, -}; - -static struct event_constraint knl_uncore_cha_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct extra_reg knl_uncore_cha_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), - EVENT_EXTRA_END -}; - -static u64 knl_cha_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x4) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP; - return mask; -} - -static struct event_constraint * -knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); -} - -static int knl_cha_hw_config(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = knl_uncore_cha_extra_regs; 
er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + - KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void hswep_cbox_enable_event(struct intel_uncore_box *box, - struct perf_event *event); - -static struct intel_uncore_ops knl_uncore_cha_ops = { - .init_box = snbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = hswep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = knl_cha_hw_config, - .get_constraint = knl_cha_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type knl_uncore_cha = { - .name = "cha", - .num_counters = 4, - .num_boxes = 38, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = KNL_CHA_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = knl_uncore_cha_constraints, - .ops = &knl_uncore_cha_ops, - .format_group = &knl_uncore_cha_format_group, -}; - -static struct attribute *knl_uncore_pcu_formats_attr[] = { - &format_attr_event2.attr, - &format_attr_use_occ_ctr.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh6.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge_det.attr, - NULL, -}; - -static struct attribute_group knl_uncore_pcu_format_group = { - .name = "format", - .attrs = knl_uncore_pcu_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, - .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, - .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &knl_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *knl_msr_uncores[] = { - &knl_uncore_ubox, - &knl_uncore_cha, - &knl_uncore_pcu, - NULL, -}; - -void knl_uncore_cpu_init(void) -{ - uncore_msr_uncores = knl_msr_uncores; -} - -static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - - pci_write_config_dword(pdev, box_ctl, 0); -} - -static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) - == UNCORE_FIXED_EVENT) - pci_write_config_dword(pdev, hwc->config_base, - hwc->config | KNL_PMON_FIXED_CTL_EN); - else - pci_write_config_dword(pdev, hwc->config_base, - hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops knl_uncore_imc_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = knl_uncore_imc_enable_box, - .read_counter = snbep_uncore_pci_read_counter, - .enable_event = knl_uncore_imc_enable_event, - .disable_event = snbep_uncore_pci_disable_event, -}; - -static struct intel_uncore_type knl_uncore_imc_uclk = { - .name = "imc_uclk", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - 
.fixed_ctr_bits = 48, - .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_UCLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, - .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, - .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_imc_dclk = { - .name = "imc", - .num_counters = 4, - .num_boxes = 6, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW, - .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL, - .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_edc_uclk = { - .name = "edc_uclk", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_UCLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, - .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, - .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_edc_eclk = { - .name = "edc_eclk", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW, - .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL, - .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct event_constraint knl_uncore_m2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type knl_uncore_m2pcie = { - .name = "m2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = knl_uncore_m2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct attribute *knl_uncore_irp_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_qor.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group knl_uncore_irp_format_group = { - .name = "format", - .attrs = knl_uncore_irp_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_irp = { - .name = "irp", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL, - .ops = &snbep_uncore_pci_ops, - .format_group = &knl_uncore_irp_format_group, -}; - -enum { - KNL_PCI_UNCORE_MC_UCLK, - KNL_PCI_UNCORE_MC_DCLK, - KNL_PCI_UNCORE_EDC_UCLK, - KNL_PCI_UNCORE_EDC_ECLK, - KNL_PCI_UNCORE_M2PCIE, - KNL_PCI_UNCORE_IRP, -}; - -static struct intel_uncore_type *knl_pci_uncores[] = { - [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk, - [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk, - [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk, - [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk, - [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie, - [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp, - NULL, -}; - -/* - 
* KNL uses a common PCI device ID for multiple instances of an Uncore PMU - * device type. prior to KNL, each instance of a PMU device type had a unique - * device ID. - * - * PCI Device ID Uncore PMU Devices - * ---------------------------------- - * 0x7841 MC0 UClk, MC1 UClk - * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2, - * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2 - * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk, - * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk - * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk, - * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk - * 0x7817 M2PCIe - * 0x7814 IRP -*/ - -static const struct pci_device_id knl_uncore_pci_ids[] = { - { /* MC UClk */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), - }, - { /* MC DClk Channel */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), - }, - { /* EDC UClk */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), - }, - { /* EDC EClk */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), - }, - { /* M2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814), - .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver knl_uncore_pci_driver = { - .name = "knl_uncore", - .id_table = knl_uncore_pci_ids, -}; - -int knl_uncore_pci_init(void) -{ - int ret; - - /* All KNL PCI based PMON units are on the same PCI bus except IRP */ - ret = snb_pci2phy_map_init(0x7814); /* IRP */ - if (ret) - return ret; - ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */ - if (ret) - return ret; - uncore_pci_uncores = knl_pci_uncores; - uncore_pci_driver = &knl_uncore_pci_driver; - return 0; -} - -/* end of KNL uncore support */ - -/* Haswell-EP uncore support */ -static struct attribute *hswep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - &format_attr_filter_tid2.attr, - &format_attr_filter_cid.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_ubox_format_group = { - .name = "format", - .attrs = hswep_uncore_ubox_formats_attr, -}; - -static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - reg1->reg = HSWEP_U_MSR_PMON_FILTER; - reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK; - reg1->idx = 0; - return 0; -} - -static struct intel_uncore_ops hswep_uncore_ubox_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = hswep_ubox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .num_shared_regs = 1, - .ops = &hswep_uncore_ubox_ops, - .format_group = &hswep_uncore_ubox_format_group, -}; - -static struct attribute *hswep_uncore_cbox_formats_attr[] = { - 
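[Editor's sketch] The knl_uncore_pci_ids[] table above relies on each pci_device_id entry packing two things into driver_data: which uncore PMU type the device belongs to (the KNL_PCI_UNCORE_* enum index) and which instance of that type it is. The minimal standalone sketch below shows how a probe path could recover that pair; the macro definitions here are local assumptions written to mirror what UNCORE_PCI_DEV_DATA() appears to encode in the table, not the kernel's own header.

/*
 * Standalone illustration (not kernel code). The packing below --
 * type in the upper bits, instance index in the low byte -- is an
 * assumption mirroring the UNCORE_PCI_DEV_DATA() usage above.
 */
#include <stdio.h>

#define UNCORE_PCI_DEV_DATA(type, idx)	(((type) << 8) | (idx))	/* assumed packing */
#define UNCORE_PCI_DEV_TYPE(data)	(((data) >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data)	((data) & 0xff)

enum { KNL_PCI_UNCORE_MC_UCLK, KNL_PCI_UNCORE_MC_DCLK };	/* subset for the demo */

int main(void)
{
	unsigned long data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0);

	/* A probe routine would use this pair to pick the right
	 * intel_uncore_type and box number for the PCI function. */
	printf("type=%lu idx=%lu\n",
	       UNCORE_PCI_DEV_TYPE(data), UNCORE_PCI_DEV_IDX(data));
	return 0;
}

Because KNL reuses one device ID for many boxes of the same type, the index stored here can no longer distinguish instances by ID alone, which is why the comment above calls this out as a departure from pre-KNL parts.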
&format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid3.attr, - &format_attr_filter_link2.attr, - &format_attr_filter_state3.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_cbox_format_group = { - .name = "format", - .attrs = hswep_uncore_cbox_formats_attr, -}; - -static struct event_constraint hswep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x09, 0x1), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct extra_reg hswep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - EVENT_EXTRA_END -}; - -static u64 hswep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - if (fields & 0x1) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) { - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC; - } - return mask; -} - -static struct event_constraint * -hswep_cbox_get_constraint(struct intel_uncore_box *box, struct 
perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask); -} - -static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + - HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void hswep_cbox_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 1, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops hswep_uncore_cbox_ops = { - .init_box = snbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = hswep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = hswep_cbox_hw_config, - .get_constraint = hswep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 18, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = hswep_uncore_cbox_constraints, - .ops = &hswep_uncore_cbox_ops, - .format_group = &hswep_uncore_cbox_format_group, -}; - -/* - * Write SBOX Initialization register bit by bit to avoid spurious #GPs - */ -static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) { - u64 init = SNBEP_PMON_BOX_CTL_INT; - u64 flags = 0; - int i; - - for_each_set_bit(i, (unsigned long *)&init, 64) { - flags |= (1ULL << i); - wrmsrl(msr, flags); - } - } -} - -static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { - __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .init_box = hswep_uncore_sbox_msr_init_box -}; - -static struct attribute *hswep_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_sbox_format_group = { - .name = "format", - .attrs = hswep_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_type hswep_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 44, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct 
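[Editor's sketch] hswep_cbox_hw_config() above (like knl_cha_hw_config() earlier) follows one pattern: walk the extra_regs table, OR together the idx bits of every entry whose event code matches, turn that set of fields into a mask of permitted filter bits, and clamp the user's config1 with it before programming the filter register. The standalone sketch below reproduces that flow; the table entries and mask bit positions are invented placeholders for the demo, not the real CBox filter layout.

/*
 * Standalone illustration of the extra-reg / filter-mask flow used by
 * knl_cha_hw_config() and hswep_cbox_hw_config() above. All constants
 * are made up for the demo.
 */
#include <stdint.h>
#include <stdio.h>

struct xreg { uint64_t event, config_mask; int idx; };

static const struct xreg extra_regs[] = {
	{ 0x0334, 0xffff, 0x4 },	/* invented entries */
	{ 0x0135, 0xffff, 0x12 },
	{ 0 }
};

static uint64_t filter_mask(int fields)
{
	uint64_t mask = 0;

	if (fields & 0x1)  mask |= 0x00000001ULL;	/* "TID"   - placeholder bit   */
	if (fields & 0x2)  mask |= 0x000001e0ULL;	/* "LINK"  - placeholder bits  */
	if (fields & 0x4)  mask |= 0x0003fc00ULL;	/* "STATE" - placeholder bits  */
	if (fields & 0x10) mask |= 0x1ff00000ULL;	/* "OPC" and friends           */
	return mask;
}

int main(void)
{
	uint64_t hwconfig = 0x0135, config1 = ~0ULL;
	int idx = 0;

	for (const struct xreg *er = extra_regs; er->event; er++)
		if (er->event == (hwconfig & er->config_mask))
			idx |= er->idx;

	if (idx)	/* only set up the filter register if the event needs one */
		printf("filter idx=%#x config=%#llx\n", idx,
		       (unsigned long long)(config1 & filter_mask(idx)));
	return 0;
}

The masking step is what keeps a user-supplied config1 from setting filter bits that the selected event does not use.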
hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << reg1->idx); - } - return 0; -} - -static struct intel_uncore_ops hswep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = hswep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, - .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &hswep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *hswep_msr_uncores[] = { - &hswep_uncore_ubox, - &hswep_uncore_cbox, - &hswep_uncore_sbox, - &hswep_uncore_pcu, - NULL, -}; - -void hswep_uncore_cpu_init(void) -{ - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - - /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } - - uncore_msr_uncores = hswep_msr_uncores; -} - -static struct intel_uncore_type hswep_uncore_ha = { - .name = "ha", - .num_counters = 5, - .num_boxes = 2, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct uncore_event_desc hswep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type hswep_uncore_imc = { - .name = "imc", - .num_counters = 5, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; - -static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops hswep_uncore_irp_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivbep_uncore_irp_disable_event, - .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = hswep_uncore_irp_read_counter, -}; - -static struct intel_uncore_type hswep_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes 
= 1, - .perf_ctr_bits = 48, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &hswep_uncore_irp_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type hswep_uncore_qpi = { - .name = "qpi", - .num_counters = 5, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .format_group = &snbep_uncore_qpi_format_group, -}; - -static struct event_constraint hswep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x24, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x27, 0x1), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type hswep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = hswep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct event_constraint hswep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type hswep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 44, - .constraints = hswep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - HSWEP_PCI_UNCORE_HA, - HSWEP_PCI_UNCORE_IMC, - HSWEP_PCI_UNCORE_IRP, - HSWEP_PCI_UNCORE_QPI, - HSWEP_PCI_UNCORE_R2PCIE, - HSWEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *hswep_pci_uncores[] = { - [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha, - [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc, - [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp, - 
[HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi, - [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie, - [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id hswep_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 1 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver hswep_uncore_pci_driver = { - .name = "hswep_uncore", - .id_table = hswep_uncore_pci_ids, -}; - -int hswep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x2f1e); - if (ret) - return ret; - uncore_pci_uncores = hswep_pci_uncores; - uncore_pci_driver = &hswep_uncore_pci_driver; - return 0; -} -/* end of Haswell-EP uncore support */ - -/* BDX 
uncore support */ - -static struct intel_uncore_type bdx_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_msr_ops, - .format_group = &ivbep_uncore_ubox_format_group, -}; - -static struct event_constraint bdx_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 24, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = bdx_uncore_cbox_constraints, - .ops = &hswep_uncore_cbox_ops, - .format_group = &hswep_uncore_cbox_format_group, -}; - -static struct intel_uncore_type bdx_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -static struct intel_uncore_type *bdx_msr_uncores[] = { - &bdx_uncore_ubox, - &bdx_uncore_cbox, - &bdx_uncore_sbox, - &hswep_uncore_pcu, - NULL, -}; - -void bdx_uncore_cpu_init(void) -{ - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = bdx_msr_uncores; -} - -static struct intel_uncore_type bdx_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type bdx_uncore_imc = { - .name = "imc", - .num_counters = 5, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type bdx_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &hswep_uncore_irp_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type bdx_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .format_group = &snbep_uncore_qpi_format_group, -}; - -static struct event_constraint bdx_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - 
UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = bdx_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct event_constraint bdx_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x7), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 3, - .perf_ctr_bits = 48, - .constraints = bdx_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - BDX_PCI_UNCORE_HA, - BDX_PCI_UNCORE_IMC, - BDX_PCI_UNCORE_IRP, - BDX_PCI_UNCORE_QPI, - BDX_PCI_UNCORE_R2PCIE, - BDX_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *bdx_pci_uncores[] = { - [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, - [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, - [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, - [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, - [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, - [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id bdx_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), - 
}, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), - }, - { /* QPI Port 1 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), - }, - { /* QPI Port 2 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver bdx_uncore_pci_driver = { - .name = "bdx_uncore", - .id_table = bdx_uncore_pci_ids, -}; - -int bdx_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x6f1e); - - if (ret) - return ret; - uncore_pci_uncores = bdx_pci_uncores; - uncore_pci_driver = &bdx_uncore_pci_driver; - return 0; -} - -/* end of BDX uncore support */ -- cgit From edbb591870dc8d1fd855014ea5360475f7bc46af Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:19 +0100 Subject: perf/x86: Move perf_event_knc.c .............. 
=> x86/events/intel/knc.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 4 +- arch/x86/events/intel/knc.c | 319 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_knc.c | 319 ----------------------------------- 4 files changed, 322 insertions(+), 322 deletions(-) create mode 100644 arch/x86/events/intel/knc.c delete mode 100644 arch/x86/kernel/cpu/perf_event_knc.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 8c939ca26f6d..c1647a65bf1c 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,7 +6,7 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/pt.o intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c new file mode 100644 index 000000000000..630bcba91b16 --- /dev/null +++ b/arch/x86/events/intel/knc.c @@ -0,0 +1,319 @@ +/* Driver for Intel Xeon Phi "Knights Corner" PMU */ + +#include +#include + +#include + +#include "../../kernel/cpu/perf_event.h" + +static const u64 knc_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, +}; + +static const u64 __initconst knc_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + /* On Xeon Phi event "0" is a valid DATA_READ */ + /* (L1 Data Cache Reads) Instruction. */ + /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ + /* bit will always be set in x86_pmu_hw_config(). 
*/ + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ + [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ + [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + /* see note on L1 OP_READ */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ + [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + + +static u64 knc_pmu_event_map(int hw_event) +{ + return knc_perfmon_event_map[hw_event]; +} + +static struct event_constraint knc_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ + INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ + INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ + INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ + INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ + INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ + INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ + INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ + INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ + INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ + INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ + INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ + INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ + INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* 
L2_DATA_PF2_MISS */ + INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ + EVENT_CONSTRAINT_END +}; + +#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d +#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e +#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f + +#define KNC_ENABLE_COUNTER0 0x00000001 +#define KNC_ENABLE_COUNTER1 0x00000002 + +static void knc_pmu_disable_all(void) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static void knc_pmu_enable_all(int added) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static inline void +knc_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static void knc_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_knc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static const struct x86_pmu knc_pmu __initconst = { + .name = "knc", + .handle_irq = knc_pmu_handle_irq, + .disable_all = knc_pmu_disable_all, + .enable_all = knc_pmu_enable_all, + .enable = knc_pmu_enable_event, + .disable = knc_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_KNC_EVNTSEL0, + .perfctr = MSR_KNC_PERFCTR0, + .event_map = knc_pmu_event_map, + .max_events = ARRAY_SIZE(knc_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 39) - 1, + .version = 0, + 
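[Editor's sketch] knc_pmu_handle_irq() above drains the global overflow status in a loop: acknowledge what was just read, handle each set bit, then re-read the status register in case more overflows arrived, bailing out after 100 iterations so a wedged PMU cannot livelock the handler. The standalone sketch below shows that drain pattern with stand-in accessors (get_status()/ack_status() replace the MSR_KNC_IA32_PERF_GLOBAL_STATUS/_OVF_CONTROL reads and writes so the example runs by itself); the control flow is a simplified equivalent of the kernel's goto-based loop, not a copy of it.

/*
 * Standalone illustration of the overflow-drain loop in
 * knc_pmu_handle_irq() above. The "pending" variable fakes the
 * global status MSR.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t pending = 0x3;		/* pretend both counters overflowed */

static uint64_t get_status(void) { return pending; }
static void ack_status(uint64_t ack) { pending &= ~ack; }

static int handle_overflows(void)
{
	int handled = 0, loops = 0;
	uint64_t status = get_status();

	while (status) {
		ack_status(status);
		if (++loops > 100) {		/* same escape hatch as the kernel code */
			fprintf(stderr, "irq loop stuck\n");
			break;
		}
		for (int bit = 0; bit < 64; bit++)
			if (status & (1ULL << bit))
				handled++;	/* real code restarts the event here */
		status = get_status();		/* more overflows may have arrived */
	}
	return handled;
}

int main(void)
{
	printf("handled=%d\n", handle_overflows());
	return 0;
}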
.num_counters = 2, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = knc_event_constraints, + .format_attrs = intel_knc_formats_attr, +}; + +__init int knc_pmu_init(void) +{ + x86_pmu = knc_pmu; + + memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 973b77bb9ceb..3195f7d06529 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c deleted file mode 100644 index 5b0c232d1ee6..000000000000 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Driver for Intel Xeon Phi "Knights Corner" PMU */ - -#include -#include - -#include - -#include "perf_event.h" - -static const u64 knc_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, -}; - -static const u64 __initconst knc_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - /* On Xeon Phi event "0" is a valid DATA_READ */ - /* (L1 Data Cache Reads) Instruction. */ - /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ - /* bit will always be set in x86_pmu_hw_config(). 
*/ - [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, - /* DATA_READ */ - [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ - [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ - [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ - [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ - [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, - /* DATA_READ */ - /* see note on L1 OP_READ */ - [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ - [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ - [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ - [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - - -static u64 knc_pmu_event_map(int hw_event) -{ - return knc_perfmon_event_map[hw_event]; -} - -static struct event_constraint knc_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ - INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ - INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ - INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ - INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ - INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ - INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ - INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ - INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ - INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ - INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ - INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ - INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ - INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ - INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ - INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ - INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ - INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ - INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* 
L2_DATA_PF2_MISS */ - INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ - EVENT_CONSTRAINT_END -}; - -#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d -#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e -#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f - -#define KNC_ENABLE_COUNTER0 0x00000001 -#define KNC_ENABLE_COUNTER1 0x00000002 - -static void knc_pmu_disable_all(void) -{ - u64 val; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); - val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); -} - -static void knc_pmu_enable_all(int added) -{ - u64 val; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); - val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); -} - -static inline void -knc_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); -} - -static void knc_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); -} - -static inline u64 knc_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void knc_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); -} - -static int knc_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int handled = 0; - int bit, loops; - u64 status; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - knc_pmu_disable_all(); - - status = knc_pmu_get_status(); - if (!status) { - knc_pmu_enable_all(0); - return handled; - } - - loops = 0; -again: - knc_pmu_ack_status(status); - if (++loops > 100) { - WARN_ONCE(1, "perf: irq loop stuck!\n"); - perf_event_print_debug(); - goto done; - } - - inc_irq_stat(apic_perf_irqs); - - for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - handled++; - - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - /* - * Repeat if there is more work to be done: - */ - status = knc_pmu_get_status(); - if (status) - goto again; - -done: - knc_pmu_enable_all(0); - - return handled; -} - - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *intel_knc_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -static const struct x86_pmu knc_pmu __initconst = { - .name = "knc", - .handle_irq = knc_pmu_handle_irq, - .disable_all = knc_pmu_disable_all, - .enable_all = knc_pmu_enable_all, - .enable = knc_pmu_enable_event, - .disable = knc_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_KNC_EVNTSEL0, - .perfctr = MSR_KNC_PERFCTR0, - .event_map = knc_pmu_event_map, - .max_events = ARRAY_SIZE(knc_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 39) - 1, - .version = 0, - 
.num_counters = 2, - .cntval_bits = 40, - .cntval_mask = (1ULL << 40) - 1, - .get_event_constraints = x86_get_event_constraints, - .event_constraints = knc_event_constraints, - .format_attrs = intel_knc_formats_attr, -}; - -__init int knc_pmu_init(void) -{ - x86_pmu = knc_pmu; - - memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} -- cgit From f03e97dbd2bb7d9436400fb6946502268ab33542 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:20 +0100 Subject: perf/x86: Move perf_event_p4.c ............... => x86/events/intel/p4.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 3 +- arch/x86/events/intel/p4.c | 1376 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_p4.c | 1376 ----------------------------------- 4 files changed, 1379 insertions(+), 1378 deletions(-) create mode 100644 arch/x86/events/intel/p4.c delete mode 100644 arch/x86/kernel/cpu/perf_event_p4.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index c1647a65bf1c..5a6f20d3f3ea 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/pt.o intel/rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c new file mode 100644 index 000000000000..1c72fedd96d3 --- /dev/null +++ b/arch/x86/events/intel/p4.c @@ -0,0 +1,1376 @@ +/* + * Netburst Performance Events (P4, old Xeon) + * + * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov + * Copyright (C) 2010 Intel Corporation, Lin Ming + * + * For licencing details see kernel-base/COPYING + */ + +#include + +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +#define P4_CNTR_LIMIT 3 +/* + * array indices: 0,1 - HT threads, used with HT enabled cpu + */ +struct p4_event_bind { + unsigned int opcode; /* Event code and ESCR selector */ + unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ +}; + +struct p4_pebs_bind { + unsigned int metric_pebs; + unsigned int metric_vert; +}; + +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ + } + +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { 
+ P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), +}; + +/* + * Note that we don't use CCCR1 here, there is an + * exception for P4_BSQ_ALLOCATION but we just have + * no workaround + * + * consider this binding as resources which particular + * event may borrow, it doesn't contain EventMask, + * Tags and friends -- they are left to a caller + */ +static struct p4_event_bind p4_event_bind_map[] = { + [P4_EVENT_TC_DELIVER_MODE] = { + .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_BPU_FETCH_REQUEST] = { + .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), + .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_ITLB_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), + .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_MEMORY_CANCEL] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MEMORY_COMPLETE] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_LOAD_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_STORE_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MOB_LOAD_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), + .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_PAGE_WALK_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), + .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_CACHE_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ALLOCATION] = { + .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_FSB_DATA_ACTIVITY] = { + .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + 
[P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), + .cntr = { {0, -1, -1}, {1, -1, -1} }, + }, + [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_SSE_INPUT_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_64BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), 
+ .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_128BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_X87_FP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_TC_MISC] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MISC), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_GLOBAL_POWER_EVENTS] = { + .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_TC_MS_XFER] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_UOP_QUEUE_WRITES] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RESOURCE_STALL] = { + .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), + .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_WC_BUFFER] = { + .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_B2B_CYCLES] = { + .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BNR] = { + .opcode = P4_OPCODE(P4_EVENT_BNR), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + 
[P4_EVENT_SNOOP] = { + .opcode = P4_OPCODE(P4_EVENT_SNOOP), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_RESPONSE] = { + .opcode = P4_OPCODE(P4_EVENT_RESPONSE), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_FRONT_END_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_EXECUTION_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_REPLAY_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOPS_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOP_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), + .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MISPRED_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_X87_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MACHINE_CLEAR] = { + .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_COMPLETED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, +}; + +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ + p4_config_pack_escr(P4_ESCR_EVENT(event) | \ + P4_ESCR_EMASK_BIT(event, bit)) | \ + p4_config_pack_cccr(metric | \ + P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) + +static __initconst const u64 p4_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__1stl_cache_load_miss_retired), + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), + }, +}, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_load_miss_retired), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_store_miss_retired), + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, + P4_PEBS_METRIC__none), + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, + P4_PEBS_METRIC__none), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). + * + * This allow us to relax restrictions a bit and run two or more + * identical events together. + * + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. + */ +struct p4_event_alias { + u64 original; + u64 alternative; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). 
We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. + */ + .original = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alternative = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; + break; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + +static u64 p4_general_events[PERF_COUNT_HW_MAX] = { + /* non-halted CPU clocks */ + [PERF_COUNT_HW_CPU_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, + + /* + * retired instructions + * in a sake of simplicity we don't use the FSB tagging + */ + [PERF_COUNT_HW_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), + + /* cache hits */ + [PERF_COUNT_HW_CACHE_REFERENCES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), + + /* cache misses */ + [PERF_COUNT_HW_CACHE_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), + + /* branch instructions retired */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), + + /* mispredicted branches retired */ + 
[PERF_COUNT_HW_BRANCH_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), + + /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ + [PERF_COUNT_HW_BUS_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | + p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), +}; + +static struct p4_event_bind *p4_config_get_bind(u64 config) +{ + unsigned int evnt = p4_config_unpack_event(config); + struct p4_event_bind *bind = NULL; + + if (evnt < ARRAY_SIZE(p4_event_bind_map)) + bind = &p4_event_bind_map[evnt]; + + return bind; +} + +static u64 p4_pmu_event_map(int hw_event) +{ + struct p4_event_bind *bind; + unsigned int esel; + u64 config; + + config = p4_general_events[hw_event]; + bind = p4_config_get_bind(config); + esel = P4_OPCODE_ESEL(bind->opcode); + config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + + return config; +} + +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v, emask; + + /* User data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) + return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. 
+ * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared across the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_ht_active() && p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + + /* + * it may have some invalid PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) + return -EINVAL; + + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) + return -EINVAL; + + return 0; +} + +static int p4_hw_config(struct perf_event *event) +{ + int cpu = get_cpu(); + int rc = 0; + u32 escr, cccr; + + /* + * the reason we use cpu that early is that: if we get scheduled + * first time on the same cpu -- we will not need swap thread + * specific flags in config (and will save some cpu cycles) + */ + + cccr = p4_default_cccr_conf(cpu); + escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, + event->attr.exclude_user); + event->hw.config = p4_config_pack_escr(escr) | + p4_config_pack_cccr(cccr); + + if (p4_ht_active() && p4_ht_thread(cpu)) + event->hw.config = p4_set_ht_bit(event->hw.config); + + if (event->attr.type == PERF_TYPE_RAW) { + struct p4_event_bind *bind; + unsigned int esel; + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + + rc = p4_validate_raw_event(event); + if (rc) + goto out; + + /* + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + */ + event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + } + + rc = x86_setup_perfctr(event); +out: + put_cpu(); + return rc; +} + +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +{ + u64 v; + + /* an official way for overflow indication */ + rdmsrl(hwc->config_base, v); + if (v & P4_CCCR_OVF) { + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + return 1; + } + + /* + * In some circumstances the overflow might issue an NMI but did + * not set P4_CCCR_OVF bit. Because a counter holds a negative value + * we simply check for high bit being set, if it's cleared it means + * the counter has reached zero value and continued counting before + * real NMI signal was received: + */ + rdmsrl(hwc->event_base, v); + if (!(v & ARCH_P4_UNFLAGGED_BIT)) + return 1; + + return 0; +} + +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * no one is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsense from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! 
+ * + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); + */ +} + +static inline void p4_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * If event gets disabled while counter is in overflowed + * state we need to clear P4_CCCR_OVF, otherwise interrupt get + * asserted again and again + */ + (void)wrmsrl_safe(hwc->config_base, + p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); +} + +static void p4_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_disable_event(event); + } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); +} + +static void p4_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int thread = p4_ht_config_thread(hwc->config); + u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); + unsigned int idx = p4_config_unpack_event(hwc->config); + struct p4_event_bind *bind; + u64 escr_addr, cccr; + + bind = &p4_event_bind_map[idx]; + escr_addr = bind->escr_msr[thread]; + + /* + * - we dont support cascaded counters yet + * - and counter 1 is broken (erratum) + */ + WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); + WARN_ON_ONCE(hwc->idx == 1); + + /* we need a real Event value */ + escr_conf &= ~P4_ESCR_EVENT_MASK; + escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); + + cccr = p4_config_unpack_cccr(hwc->config); + + /* + * it could be Cache event so we need to write metrics + * into additional MSRs + */ + p4_pmu_enable_pebs(hwc->config); + + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, + (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); +} + +static void p4_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_enable_event(event); + } +} + +static int p4_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; + + if (!test_bit(idx, cpuc->active_mask)) { + /* catch in-flight IRQs */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; + continue; + } + + event = cpuc->events[idx]; + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + /* it might be unflagged overflow */ + overflow = p4_pmu_clear_cccr_ovf(hwc); + + val = x86_perf_event_update(event); + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + continue; + + handled += overflow; + + /* event overflow for sure */ + perf_sample_data_init(&data, 0, hwc->last_period); + + if 
(!x86_perf_event_set_period(event)) + continue; + + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + return handled; +} + +/* + * swap thread specific fields according to a thread + * we are going to run on + */ +static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) +{ + u32 escr, cccr; + + /* + * we either lucky and continue on same cpu or no HT support + */ + if (!p4_should_swap_ts(hwc->config, cpu)) + return; + + /* + * the event is migrated from an another logical + * cpu, so we need to swap thread specific flags + */ + + escr = p4_config_unpack_escr(hwc->config); + cccr = p4_config_unpack_cccr(hwc->config); + + if (p4_ht_thread(cpu)) { + cccr &= ~P4_CCCR_OVF_PMI_T0; + cccr |= P4_CCCR_OVF_PMI_T1; + if (escr & P4_ESCR_T0_OS) { + escr &= ~P4_ESCR_T0_OS; + escr |= P4_ESCR_T1_OS; + } + if (escr & P4_ESCR_T0_USR) { + escr &= ~P4_ESCR_T0_USR; + escr |= P4_ESCR_T1_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config |= P4_CONFIG_HT; + } else { + cccr &= ~P4_CCCR_OVF_PMI_T1; + cccr |= P4_CCCR_OVF_PMI_T0; + if (escr & P4_ESCR_T1_OS) { + escr &= ~P4_ESCR_T1_OS; + escr |= P4_ESCR_T0_OS; + } + if (escr & P4_ESCR_T1_USR) { + escr &= ~P4_ESCR_T1_USR; + escr |= P4_ESCR_T0_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config &= ~P4_CONFIG_HT; + } +} + +/* + * ESCR address hashing is tricky, ESCRs are not sequential + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and + * the metric between any ESCRs is laid in range [0xa0,0xe1] + * + * so we make ~70% filled hashtable + */ + +#define P4_ESCR_MSR_BASE 0x000003a0 +#define P4_ESCR_MSR_MAX 0x000003e1 +#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) +#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) +#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr + +static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), + 
P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), +}; + +static int p4_get_escr_idx(unsigned int addr) +{ + unsigned int idx = P4_ESCR_MSR_IDX(addr); + + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { + WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); + return -1; + } + + return idx; +} + +static int p4_next_cntr(int thread, unsigned long *used_mask, + struct p4_event_bind *bind) +{ + int i, j; + + for (i = 0; i < P4_CNTR_LIMIT; i++) { + j = bind->cntr[thread][i]; + if (j != -1 && !test_bit(j, used_mask)) + return j; + } + + return -1; +} + +static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; + int cpu = smp_processor_id(); + struct hw_perf_event *hwc; + struct p4_event_bind *bind; + unsigned int i, thread, num; + int cntr_idx, escr_idx; + u64 config_alias; + int pass; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); + + for (i = 0, num = n; i < n; i++, num--) { + + hwc = &cpuc->event_list[i]->hw; + thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. + */ + if (pass > 2) + goto done; + + bind = p4_config_get_bind(hwc->config); + escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); + if (unlikely(escr_idx == -1)) + goto done; + + if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { + cntr_idx = hwc->idx; + if (assign) + assign[i] = hwc->idx; + goto reserve; + } + + cntr_idx = p4_next_cntr(thread, used_mask, bind); + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Check whether an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } + /* + * Perf does test runs to see if a whole group can be assigned + * together succesfully. There can be multiple rounds of this. + * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config + * bits, such that the next round of group assignments will + * cause the above p4_should_swap_ts to pass instead of fail. + * This leads to counters exclusive to thread0 being used by + * thread1. + * + * Solve this with a cheap hack, reset the idx back to -1 to + * force a new lookup (p4_next_cntr) to get the right counter + * for the right thread. 
+ * + * This probably doesn't comply with the general spirit of how + * perf wants to work, but P4 is special. :-( + */ + if (p4_should_swap_ts(hwc->config, cpu)) + hwc->idx = -1; + p4_pmu_swap_config_ts(hwc, cpu); + if (assign) + assign[i] = cntr_idx; +reserve: + set_bit(cntr_idx, used_mask); + set_bit(escr_idx, escr_mask); + } + +done: + return num ? -EINVAL : 0; +} + +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + +static __initconst const struct x86_pmu p4_pmu = { + .name = "Netburst P4/Xeon", + .handle_irq = p4_pmu_handle_irq, + .disable_all = p4_pmu_disable_all, + .enable_all = p4_pmu_enable_all, + .enable = p4_pmu_enable_event, + .disable = p4_pmu_disable_event, + .eventsel = MSR_P4_BPU_CCCR0, + .perfctr = MSR_P4_BPU_PERFCTR0, + .event_map = p4_pmu_event_map, + .max_events = ARRAY_SIZE(p4_general_events), + .get_event_constraints = x86_get_event_constraints, + /* + * IF HT disabled we may need to use all + * ARCH_P4_MAX_CCCR counters simulaneously + * though leave it restricted at moment assuming + * HT is on + */ + .num_counters = ARCH_P4_MAX_CCCR, + .apic = 1, + .cntval_bits = ARCH_P4_CNTRVAL_BITS, + .cntval_mask = ARCH_P4_CNTRVAL_MASK, + .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_config = p4_hw_config, + .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, +}; + +__init int p4_pmu_init(void) +{ + unsigned int low, high; + int i, reg; + + /* If we get stripped -- indexing fails */ + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!(low & (1 << 7))) { + pr_cont("unsupported Netburst CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Netburst events, "); + + x86_pmu = p4_pmu; + + /* + * Even though the counters are configured to interrupt a particular + * logical processor when an overflow happens, testing has shown that + * on kdump kernels (which uses a single cpu), thread1's counter + * continues to run and will report an NMI on thread0. Due to the + * overflow bug, this leads to a stream of unknown NMIs. + * + * Solve this by zero'ing out the registers to mimic a reset. 
+ */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + wrmsrl_safe(reg, 0ULL); + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3195f7d06529..2bb41d02e45b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c deleted file mode 100644 index f2e56783af3d..000000000000 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ /dev/null @@ -1,1376 +0,0 @@ -/* - * Netburst Performance Events (P4, old Xeon) - * - * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov - * Copyright (C) 2010 Intel Corporation, Lin Ming - * - * For licencing details see kernel-base/COPYING - */ - -#include - -#include -#include -#include - -#include "perf_event.h" - -#define P4_CNTR_LIMIT 3 -/* - * array indices: 0,1 - HT threads, used with HT enabled cpu - */ -struct p4_event_bind { - unsigned int opcode; /* Event code and ESCR selector */ - unsigned int escr_msr[2]; /* ESCR MSR for this event */ - unsigned int escr_emask; /* valid ESCR EventMask bits */ - unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ -}; - -struct p4_pebs_bind { - unsigned int metric_pebs; - unsigned int metric_vert; -}; - -/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ -#define P4_GEN_PEBS_BIND(name, pebs, vert) \ - [P4_PEBS_METRIC__##name] = { \ - .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ - .metric_vert = vert, \ - } - -/* - * note we have P4_PEBS_ENABLE_UOP_TAG always set here - * - * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of - * event configuration to find out which values are to be - * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters - */ -static struct p4_pebs_bind p4_pebs_bind_map[] = { - P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), - P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), - P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), - P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), - P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), - P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), - P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), - P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), - P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), -}; - -/* - * Note that we don't use CCCR1 here, there is an - * exception for P4_BSQ_ALLOCATION but we just have - * no workaround - * - * consider this binding as resources which particular - * event may borrow, it doesn't contain EventMask, - * Tags and friends -- they are left to a caller - */ -static struct p4_event_bind p4_event_bind_map[] = { - [P4_EVENT_TC_DELIVER_MODE] = { - .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), - .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | - 
P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), - .shared = 1, - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_BPU_FETCH_REQUEST] = { - .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), - .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_ITLB_REFERENCE] = { - .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), - .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_MEMORY_CANCEL] = { - .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), - .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_MEMORY_COMPLETE] = { - .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), - .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_LOAD_PORT_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), - .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_STORE_PORT_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), - .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_MOB_LOAD_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), - .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_PAGE_WALK_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), - .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), - .shared = 1, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BSQ_CACHE_REFERENCE] = { - .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), - .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_IOQ_ALLOCATION] = { - .opcode = 
P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ - .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), - .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), - .cntr = { {2, -1, -1}, {3, -1, -1} }, - }, - [P4_EVENT_FSB_DATA_ACTIVITY] = { - .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), - .shared = 1, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ - .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), - .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), - .cntr = { {0, -1, -1}, {1, -1, -1} }, - }, - [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ - .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), - .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | - 
P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), - .cntr = { {2, -1, -1}, {3, -1, -1} }, - }, - [P4_EVENT_SSE_INPUT_ASSIST] = { - .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_PACKED_SP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_PACKED_DP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_SCALAR_SP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_SCALAR_DP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_64BIT_MMX_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_128BIT_MMX_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_X87_FP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_TC_MISC] = { - .opcode = P4_OPCODE(P4_EVENT_TC_MISC), - .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_GLOBAL_POWER_EVENTS] = { - .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_TC_MS_XFER] = { - .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), - .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - 
[P4_EVENT_UOP_QUEUE_WRITES] = { - .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), - .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), - .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RETIRED_BRANCH_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), - .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RESOURCE_STALL] = { - .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), - .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_WC_BUFFER] = { - .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), - .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_B2B_CYCLES] = { - .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BNR] = { - .opcode = P4_OPCODE(P4_EVENT_BNR), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_SNOOP] = { - .opcode = P4_OPCODE(P4_EVENT_SNOOP), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_RESPONSE] = { - .opcode = P4_OPCODE(P4_EVENT_RESPONSE), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_FRONT_END_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_EXECUTION_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), - .cntr = { {12, 
13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_REPLAY_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_INSTR_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_UOPS_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_UOP_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), - .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_BRANCH_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_MISPRED_BRANCH_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_X87_ASSIST] = { - .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_MACHINE_CLEAR] = { - .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_INSTR_COMPLETED] = { - .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, -}; - -#define P4_GEN_CACHE_EVENT(event, bit, metric) \ - p4_config_pack_escr(P4_ESCR_EVENT(event) | \ - P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(metric | \ - P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) - -static __initconst const u64 p4_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 
0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__1stl_cache_load_miss_retired), - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__2ndl_cache_load_miss_retired), - }, -}, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__dtlb_load_miss_retired), - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__dtlb_store_miss_retired), - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_PEBS_METRIC__none), - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_PEBS_METRIC__none), - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * Because of Netburst being quite restricted in how many - * identical events may run simultaneously, we introduce event aliases, - * ie the different events which have the same functionality but - * utilize non-intersected resources (ESCR/CCCR/counter registers). - * - * This allow us to relax restrictions a bit and run two or more - * identical events together. - * - * Never set any custom internal bits such as P4_CONFIG_HT, - * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are - * either up to date automatically or not applicable at all. - */ -struct p4_event_alias { - u64 original; - u64 alternative; -} p4_event_aliases[] = { - { - /* - * Non-halted cycles can be substituted with non-sleeping cycles (see - * Intel SDM Vol3b for details). We need this alias to be able - * to run nmi-watchdog and 'perf top' (or any other user space tool - * which is interested in running PERF_COUNT_HW_CPU_CYCLES) - * simultaneously. - */ - .original = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), - .alternative = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE), - }, -}; - -static u64 p4_get_alias_event(u64 config) -{ - u64 config_match; - int i; - - /* - * Only event with special mark is allowed, - * we're to be sure it didn't come as malformed - * RAW event. 
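- * (The special mark is P4_CONFIG_ALIASABLE: it is set by the kernel itself,
- * e.g. for the PERF_COUNT_HW_CPU_CYCLES entry in p4_general_events[], while
- * p4_hw_config() clears kernel-reserved bits from user RAW configs, so a
- * malformed RAW event is not expected to carry it.)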
- */ - if (!(config & P4_CONFIG_ALIASABLE)) - return 0; - - config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; - - for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { - if (config_match == p4_event_aliases[i].original) { - config_match = p4_event_aliases[i].alternative; - break; - } else if (config_match == p4_event_aliases[i].alternative) { - config_match = p4_event_aliases[i].original; - break; - } - } - - if (i >= ARRAY_SIZE(p4_event_aliases)) - return 0; - - return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); -} - -static u64 p4_general_events[PERF_COUNT_HW_MAX] = { - /* non-halted CPU clocks */ - [PERF_COUNT_HW_CPU_CYCLES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | - P4_CONFIG_ALIASABLE, - - /* - * retired instructions - * in a sake of simplicity we don't use the FSB tagging - */ - [PERF_COUNT_HW_INSTRUCTIONS] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), - - /* cache hits */ - [PERF_COUNT_HW_CACHE_REFERENCES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), - - /* cache misses */ - [PERF_COUNT_HW_CACHE_MISSES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), - - /* branch instructions retired */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), - - /* mispredicted branches retired */ - [PERF_COUNT_HW_BRANCH_MISSES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), - - /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ - [PERF_COUNT_HW_BUS_CYCLES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | - p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), -}; - -static struct p4_event_bind *p4_config_get_bind(u64 config) -{ - unsigned int evnt = p4_config_unpack_event(config); - struct p4_event_bind *bind = NULL; - - if (evnt < ARRAY_SIZE(p4_event_bind_map)) - bind = &p4_event_bind_map[evnt]; - - return bind; -} - -static u64 p4_pmu_event_map(int hw_event) -{ - struct p4_event_bind *bind; - unsigned int esel; - u64 config; - - config = p4_general_events[hw_event]; - bind = p4_config_get_bind(config); - esel = P4_OPCODE_ESEL(bind->opcode); - config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); - - return config; -} - -/* check cpu model specifics */ -static bool p4_event_match_cpu_model(unsigned int 
event_idx) -{ - /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ - if (event_idx == P4_EVENT_INSTR_COMPLETED) { - if (boot_cpu_data.x86_model != 3 && - boot_cpu_data.x86_model != 4 && - boot_cpu_data.x86_model != 6) - return false; - } - - /* - * For info - * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 - */ - - return true; -} - -static int p4_validate_raw_event(struct perf_event *event) -{ - unsigned int v, emask; - - /* User data may have out-of-bound event index */ - v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) - return -EINVAL; - - /* It may be unsupported: */ - if (!p4_event_match_cpu_model(v)) - return -EINVAL; - - /* - * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as - * in Architectural Performance Monitoring, it means not - * on _which_ logical cpu to count but rather _when_, ie it - * depends on logical cpu state -- count event if one cpu active, - * none, both or any, so we just allow user to pass any value - * desired. - * - * In turn we always set Tx_OS/Tx_USR bits bound to logical - * cpu without their propagation to another cpu - */ - - /* - * if an event is shared across the logical threads - * the user needs special permissions to be able to use it - */ - if (p4_ht_active() && p4_event_bind_map[v].shared) { - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - /* ESCR EventMask bits may be invalid */ - emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; - if (emask & ~p4_event_bind_map[v].escr_emask) - return -EINVAL; - - /* - * it may have some invalid PEBS bits - */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) - return -EINVAL; - - v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) - return -EINVAL; - - return 0; -} - -static int p4_hw_config(struct perf_event *event) -{ - int cpu = get_cpu(); - int rc = 0; - u32 escr, cccr; - - /* - * the reason we use cpu that early is that: if we get scheduled - * first time on the same cpu -- we will not need swap thread - * specific flags in config (and will save some cpu cycles) - */ - - cccr = p4_default_cccr_conf(cpu); - escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, - event->attr.exclude_user); - event->hw.config = p4_config_pack_escr(escr) | - p4_config_pack_cccr(cccr); - - if (p4_ht_active() && p4_ht_thread(cpu)) - event->hw.config = p4_set_ht_bit(event->hw.config); - - if (event->attr.type == PERF_TYPE_RAW) { - struct p4_event_bind *bind; - unsigned int esel; - /* - * Clear bits we reserve to be managed by kernel itself - * and never allowed from a user space - */ - event->attr.config &= P4_CONFIG_MASK; - - rc = p4_validate_raw_event(event); - if (rc) - goto out; - - /* - * Note that for RAW events we allow user to use P4_CCCR_RESERVED - * bits since we keep additional info here (for cache events and etc) - */ - event->hw.config |= event->attr.config; - bind = p4_config_get_bind(event->attr.config); - if (!bind) { - rc = -EINVAL; - goto out; - } - esel = P4_OPCODE_ESEL(bind->opcode); - event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); - } - - rc = x86_setup_perfctr(event); -out: - put_cpu(); - return rc; -} - -static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) -{ - u64 v; - - /* an official way for overflow indication */ - rdmsrl(hwc->config_base, v); - if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); - return 1; - } - - /* - * In some circumstances the overflow might issue an NMI but 
did - * not set P4_CCCR_OVF bit. Because a counter holds a negative value - * we simply check for high bit being set, if it's cleared it means - * the counter has reached zero value and continued counting before - * real NMI signal was received: - */ - rdmsrl(hwc->event_base, v); - if (!(v & ARCH_P4_UNFLAGGED_BIT)) - return 1; - - return 0; -} - -static void p4_pmu_disable_pebs(void) -{ - /* - * FIXME - * - * It's still allowed that two threads setup same cache - * events so we can't simply clear metrics until we knew - * no one is depending on us, so we need kind of counter - * for "ReplayEvent" users. - * - * What is more complex -- RAW events, if user (for some - * reason) will pass some cache event metric with improper - * event opcode -- it's fine from hardware point of view - * but completely nonsense from "meaning" of such action. - * - * So at moment let leave metrics turned on forever -- it's - * ok for now but need to be revisited! - * - * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); - * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); - */ -} - -static inline void p4_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - /* - * If event gets disabled while counter is in overflowed - * state we need to clear P4_CCCR_OVF, otherwise interrupt get - * asserted again and again - */ - (void)wrmsrl_safe(hwc->config_base, - p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); -} - -static void p4_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - if (!test_bit(idx, cpuc->active_mask)) - continue; - p4_pmu_disable_event(event); - } - - p4_pmu_disable_pebs(); -} - -/* configuration must be valid */ -static void p4_pmu_enable_pebs(u64 config) -{ - struct p4_pebs_bind *bind; - unsigned int idx; - - BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); - - idx = p4_config_unpack_metric(config); - if (idx == P4_PEBS_METRIC__none) - return; - - bind = &p4_pebs_bind_map[idx]; - - (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); -} - -static void p4_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int thread = p4_ht_config_thread(hwc->config); - u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); - unsigned int idx = p4_config_unpack_event(hwc->config); - struct p4_event_bind *bind; - u64 escr_addr, cccr; - - bind = &p4_event_bind_map[idx]; - escr_addr = bind->escr_msr[thread]; - - /* - * - we dont support cascaded counters yet - * - and counter 1 is broken (erratum) - */ - WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); - WARN_ON_ONCE(hwc->idx == 1); - - /* we need a real Event value */ - escr_conf &= ~P4_ESCR_EVENT_MASK; - escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); - - cccr = p4_config_unpack_cccr(hwc->config); - - /* - * it could be Cache event so we need to write metrics - * into additional MSRs - */ - p4_pmu_enable_pebs(hwc->config); - - (void)wrmsrl_safe(escr_addr, escr_conf); - (void)wrmsrl_safe(hwc->config_base, - (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); -} - -static void p4_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - if (!test_bit(idx, 
cpuc->active_mask)) - continue; - p4_pmu_enable_event(event); - } -} - -static int p4_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int overflow; - - if (!test_bit(idx, cpuc->active_mask)) { - /* catch in-flight IRQs */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; - continue; - } - - event = cpuc->events[idx]; - hwc = &event->hw; - - WARN_ON_ONCE(hwc->idx != idx); - - /* it might be unflagged overflow */ - overflow = p4_pmu_clear_cccr_ovf(hwc); - - val = x86_perf_event_update(event); - if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) - continue; - - handled += overflow; - - /* event overflow for sure */ - perf_sample_data_init(&data, 0, hwc->last_period); - - if (!x86_perf_event_set_period(event)) - continue; - - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - /* - * When dealing with the unmasking of the LVTPC on P4 perf hw, it has - * been observed that the OVF bit flag has to be cleared first _before_ - * the LVTPC can be unmasked. - * - * The reason is the NMI line will continue to be asserted while the OVF - * bit is set. This causes a second NMI to generate if the LVTPC is - * unmasked before the OVF bit is cleared, leading to unknown NMI - * messages. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - return handled; -} - -/* - * swap thread specific fields according to a thread - * we are going to run on - */ -static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) -{ - u32 escr, cccr; - - /* - * we either lucky and continue on same cpu or no HT support - */ - if (!p4_should_swap_ts(hwc->config, cpu)) - return; - - /* - * the event is migrated from an another logical - * cpu, so we need to swap thread specific flags - */ - - escr = p4_config_unpack_escr(hwc->config); - cccr = p4_config_unpack_cccr(hwc->config); - - if (p4_ht_thread(cpu)) { - cccr &= ~P4_CCCR_OVF_PMI_T0; - cccr |= P4_CCCR_OVF_PMI_T1; - if (escr & P4_ESCR_T0_OS) { - escr &= ~P4_ESCR_T0_OS; - escr |= P4_ESCR_T1_OS; - } - if (escr & P4_ESCR_T0_USR) { - escr &= ~P4_ESCR_T0_USR; - escr |= P4_ESCR_T1_USR; - } - hwc->config = p4_config_pack_escr(escr); - hwc->config |= p4_config_pack_cccr(cccr); - hwc->config |= P4_CONFIG_HT; - } else { - cccr &= ~P4_CCCR_OVF_PMI_T1; - cccr |= P4_CCCR_OVF_PMI_T0; - if (escr & P4_ESCR_T1_OS) { - escr &= ~P4_ESCR_T1_OS; - escr |= P4_ESCR_T0_OS; - } - if (escr & P4_ESCR_T1_USR) { - escr &= ~P4_ESCR_T1_USR; - escr |= P4_ESCR_T0_USR; - } - hwc->config = p4_config_pack_escr(escr); - hwc->config |= p4_config_pack_cccr(cccr); - hwc->config &= ~P4_CONFIG_HT; - } -} - -/* - * ESCR address hashing is tricky, ESCRs are not sequential - * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and - * the metric between any ESCRs is laid in range [0xa0,0xe1] - * - * so we make ~70% filled hashtable - */ - -#define P4_ESCR_MSR_BASE 0x000003a0 -#define P4_ESCR_MSR_MAX 0x000003e1 -#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) -#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) -#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr - -static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), - 
P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), -}; - -static int p4_get_escr_idx(unsigned int addr) -{ - unsigned int idx = P4_ESCR_MSR_IDX(addr); - - if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || - !p4_escr_table[idx] || - p4_escr_table[idx] != addr)) { - WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); - return -1; - } - - return idx; -} - -static int p4_next_cntr(int thread, unsigned long *used_mask, - struct p4_event_bind *bind) -{ - int i, j; - - for (i = 0; i < P4_CNTR_LIMIT; i++) { - j = bind->cntr[thread][i]; - if (j != -1 && !test_bit(j, used_mask)) - return j; - } - - return -1; -} - -static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) -{ - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; - int cpu = smp_processor_id(); - struct hw_perf_event *hwc; - struct p4_event_bind *bind; - unsigned int i, thread, num; - int cntr_idx, escr_idx; - u64 config_alias; - int pass; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); - - for (i = 0, num = n; i < n; i++, num--) { - - hwc = &cpuc->event_list[i]->hw; - thread = p4_ht_thread(cpu); - pass = 0; - -again: - /* - * It's possible to hit a circular lock - * between original and alternative events - * if both are scheduled already. 
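- * The 'pass' counter below bounds the retries: on a conflict the config is
- * flipped to its alias via p4_get_alias_event() and we try again, giving up
- * after a couple of passes instead of ping-ponging between the original and
- * the alternative event forever.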
- */ - if (pass > 2) - goto done; - - bind = p4_config_get_bind(hwc->config); - escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); - if (unlikely(escr_idx == -1)) - goto done; - - if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { - cntr_idx = hwc->idx; - if (assign) - assign[i] = hwc->idx; - goto reserve; - } - - cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { - /* - * Check whether an event alias is still available. - */ - config_alias = p4_get_alias_event(hwc->config); - if (!config_alias) - goto done; - hwc->config = config_alias; - pass++; - goto again; - } - /* - * Perf does test runs to see if a whole group can be assigned - * together succesfully. There can be multiple rounds of this. - * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config - * bits, such that the next round of group assignments will - * cause the above p4_should_swap_ts to pass instead of fail. - * This leads to counters exclusive to thread0 being used by - * thread1. - * - * Solve this with a cheap hack, reset the idx back to -1 to - * force a new lookup (p4_next_cntr) to get the right counter - * for the right thread. - * - * This probably doesn't comply with the general spirit of how - * perf wants to work, but P4 is special. :-( - */ - if (p4_should_swap_ts(hwc->config, cpu)) - hwc->idx = -1; - p4_pmu_swap_config_ts(hwc, cpu); - if (assign) - assign[i] = cntr_idx; -reserve: - set_bit(cntr_idx, used_mask); - set_bit(escr_idx, escr_mask); - } - -done: - return num ? -EINVAL : 0; -} - -PMU_FORMAT_ATTR(cccr, "config:0-31" ); -PMU_FORMAT_ATTR(escr, "config:32-62"); -PMU_FORMAT_ATTR(ht, "config:63" ); - -static struct attribute *intel_p4_formats_attr[] = { - &format_attr_cccr.attr, - &format_attr_escr.attr, - &format_attr_ht.attr, - NULL, -}; - -static __initconst const struct x86_pmu p4_pmu = { - .name = "Netburst P4/Xeon", - .handle_irq = p4_pmu_handle_irq, - .disable_all = p4_pmu_disable_all, - .enable_all = p4_pmu_enable_all, - .enable = p4_pmu_enable_event, - .disable = p4_pmu_disable_event, - .eventsel = MSR_P4_BPU_CCCR0, - .perfctr = MSR_P4_BPU_PERFCTR0, - .event_map = p4_pmu_event_map, - .max_events = ARRAY_SIZE(p4_general_events), - .get_event_constraints = x86_get_event_constraints, - /* - * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters simulaneously - * though leave it restricted at moment assuming - * HT is on - */ - .num_counters = ARCH_P4_MAX_CCCR, - .apic = 1, - .cntval_bits = ARCH_P4_CNTRVAL_BITS, - .cntval_mask = ARCH_P4_CNTRVAL_MASK, - .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_config = p4_hw_config, - .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken from OProfile code - */ - .perfctr_second_write = 1, - - .format_attrs = intel_p4_formats_attr, -}; - -__init int p4_pmu_init(void) -{ - unsigned int low, high; - int i, reg; - - /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); - - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (!(low & (1 << 7))) { - pr_cont("unsupported Netburst CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Netburst events, "); - - x86_pmu = p4_pmu; - - /* - * Even 
though the counters are configured to interrupt a particular - * logical processor when an overflow happens, testing has shown that - * on kdump kernels (which uses a single cpu), thread1's counter - * continues to run and will report an NMI on thread0. Due to the - * overflow bug, this leads to a stream of unknown NMIs. - * - * Solve this by zero'ing out the registers to mimic a reset. - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu_config_addr(i); - wrmsrl_safe(reg, 0ULL); - } - - return 0; -} -- cgit From 5e865ed44b39fa991a8fb1d7d9c0338f8fedac33 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:21 +0100 Subject: perf/x86: Move perf_event_p6.c ............... => x86/events/intel/p6.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/p6.c | 279 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 2 - arch/x86/kernel/cpu/perf_event_p6.c | 279 ------------------------------------ 4 files changed, 280 insertions(+), 282 deletions(-) create mode 100644 arch/x86/events/intel/p6.c delete mode 100644 arch/x86/kernel/cpu/perf_event_p6.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 5a6f20d3f3ea..b4e80d02a78a 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c new file mode 100644 index 000000000000..ee5c4e8bd556 --- /dev/null +++ b/arch/x86/events/intel/p6.c @@ -0,0 +1,279 @@ +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static const u64 __initconst p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ 
C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(int added) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val = P6_NOP_EVENT; + + (void)wrmsrl_safe(hwc->config_base, val); +} + +static void p6_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. 
+ */ + + (void)wrmsrl_safe(hwc->config_base, val); +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static __initconst const struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .cntval_bits = 32, + .cntval_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + +}; + +static __init void p6_pmu_rdpmc_quirk(void) +{ + if (boot_cpu_data.x86_mask < 9) { + /* + * PPro erratum 26; fixed in stepping 9 and above. + */ + pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); + x86_pmu.attr_rdpmc_broken = 1; + x86_pmu.attr_rdpmc = 0; + } +} + +__init int p6_pmu_init(void) +{ + x86_pmu = p6_pmu; + + switch (boot_cpu_data.x86_model) { + case 1: /* Pentium Pro */ + x86_add_quirk(p6_pmu_rdpmc_quirk); + break; + + case 3: /* Pentium II - Klamath */ + case 5: /* Pentium II - Deschutes */ + case 6: /* Pentium II - Mendocino */ + break; + + case 7: /* Pentium III - Katmai */ + case 8: /* Pentium III - Coppermine */ + case 10: /* Pentium III Xeon */ + case 11: /* Pentium III - Tualatin */ + break; + + case 9: /* Pentium M - Banias */ + case 13: /* Pentium M - Dothan */ + break; + + default: + pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 2bb41d02e45b..5cc09fc6ae60 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,8 +31,6 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o - obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c deleted file mode 100644 index 7c1a0c07b607..000000000000 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ /dev/null @@ -1,279 +0,0 @@ -#include -#include - -#include "perf_event.h" - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* 
INST_RETIRED */ - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ - -}; - -static const u64 __initconst p6_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ - [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ - [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ - [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ - [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. 
- */ -#define P6_NOP_EVENT 0x0000002EULL - -static struct event_constraint p6_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -static void p6_pmu_disable_all(void) -{ - u64 val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void p6_pmu_enable_all(int added) -{ - unsigned long val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static inline void -p6_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val = P6_NOP_EVENT; - - (void)wrmsrl_safe(hwc->config_base, val); -} - -static void p6_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - - /* - * p6 only has a global event enable, set on PerfEvtSel0 - * We "disable" events by programming P6_NOP_EVENT - * and we rely on p6_pmu_enable_all() being called - * to actually enable the events. - */ - - (void)wrmsrl_safe(hwc->config_base, val); -} - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(pc, "config:19" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *intel_p6_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -static __initconst const struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = x86_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_counters = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, - .get_event_constraints = x86_get_event_constraints, - .event_constraints = p6_event_constraints, - - .format_attrs = intel_p6_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - -}; - -static __init void p6_pmu_rdpmc_quirk(void) -{ - if (boot_cpu_data.x86_mask < 9) { - /* - * PPro erratum 26; fixed in stepping 9 and above. 
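- * (x86_mask is the CPU stepping; for steppings below 9 the quirk below
- * marks user-space RDPMC as broken and disables the sysfs attribute.)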
- */ - pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); - x86_pmu.attr_rdpmc_broken = 1; - x86_pmu.attr_rdpmc = 0; - } -} - -__init int p6_pmu_init(void) -{ - x86_pmu = p6_pmu; - - switch (boot_cpu_data.x86_model) { - case 1: /* Pentium Pro */ - x86_add_quirk(p6_pmu_rdpmc_quirk); - break; - - case 3: /* Pentium II - Klamath */ - case 5: /* Pentium II - Deschutes */ - case 6: /* Pentium II - Mendocino */ - break; - - case 7: /* Pentium III - Katmai */ - case 8: /* Pentium III - Coppermine */ - case 10: /* Pentium III Xeon */ - case 11: /* Pentium III - Tualatin */ - break; - - case 9: /* Pentium M - Banias */ - case 13: /* Pentium M - Dothan */ - break; - - default: - pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); - return -ENODEV; - } - - memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} -- cgit From 65a27a3510c82f16e673548b4c819462fabb12ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:22 +0100 Subject: perf/x86: Move perf_event_msr.c .............. => x86/events/msr.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/Makefile | 4 +- arch/x86/events/msr.c | 241 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 6 - arch/x86/kernel/cpu/perf_event_msr.c | 241 ----------------------------------- 4 files changed, 243 insertions(+), 249 deletions(-) create mode 100644 arch/x86/events/msr.c delete mode 100644 arch/x86/kernel/cpu/perf_event_msr.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index b4e80d02a78a..fdfea1511cc0 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,13 +1,13 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o -obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o +obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c new file mode 100644 index 000000000000..ec863b9a9f78 --- /dev/null +++ b/arch/x86/events/msr.c @@ -0,0 +1,241 @@ +#include + +enum perf_msr_id { + PERF_MSR_TSC = 0, + PERF_MSR_APERF = 1, + PERF_MSR_MPERF = 2, + PERF_MSR_PPERF = 3, + PERF_MSR_SMI = 4, + + PERF_MSR_EVENT_MAX, +}; + +static bool test_aperfmperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +static bool test_intel(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + 
case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_MSR_SMI) + return true; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) + return true; + break; + } + + return false; +} + +struct perf_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); + +static struct perf_msr msr[] = { + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, +}; + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group events_attr_group = { + .name = "events", + .attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; +static struct attribute_group format_attr_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &events_attr_group, + &format_attr_group, + NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (!msr[cfg].attr) + return -EINVAL; + + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; + + return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ + u64 now; + + if (event->hw.event_base) + rdmsrl(event->hw.event_base, now); + else + rdtscll(now); + + return now; +} +static void msr_event_update(struct perf_event *event) +{ + u64 prev, now; + s64 delta; + + /* Careful, an NMI might modify the previous event value. 
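+ * The cmpxchg loop below simply retries if prev_count changed under us.
+ * MSR_SMI_COUNT is a 32-bit counter, hence the sign_extend64() of the
+ * delta, so that a wrap of the low 32 bits is not misread as a huge jump.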
*/ +again: + prev = local64_read(&event->hw.prev_count); + now = msr_read_counter(event); + + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + + local64_add(now - prev, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ + u64 now; + + now = msr_read_counter(event); + local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ + msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ + msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + msr_event_start(event, flags); + + return 0; +} + +static struct pmu pmu_msr = { + .task_ctx_nr = perf_sw_context, + .attr_groups = attr_groups, + .event_init = msr_event_init, + .add = msr_event_add, + .del = msr_event_del, + .start = msr_event_start, + .stop = msr_event_stop, + .read = msr_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init msr_init(void) +{ + int i, j = 0; + + if (!boot_cpu_has(X86_FEATURE_TSC)) { + pr_cont("no MSR PMU driver.\n"); + return 0; + } + + /* Probe the MSRs. */ + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { + u64 val; + + /* + * Virt sucks arse; you cannot tell if a R/O MSR is present :/ + */ + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining MSRs in the sysfs attrs. */ + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + perf_pmu_register(&pmu_msr, "msr", -1); + + return 0; +} +device_initcall(msr_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5cc09fc6ae60..7a60424d63fa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,12 +30,6 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o -endif - - obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c deleted file mode 100644 index ec863b9a9f78..000000000000 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ /dev/null @@ -1,241 +0,0 @@ -#include - -enum perf_msr_id { - PERF_MSR_TSC = 0, - PERF_MSR_APERF = 1, - PERF_MSR_MPERF = 2, - PERF_MSR_PPERF = 3, - PERF_MSR_SMI = 4, - - PERF_MSR_EVENT_MAX, -}; - -static bool test_aperfmperf(int idx) -{ - return boot_cpu_has(X86_FEATURE_APERFMPERF); -} - -static bool test_intel(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e 
(Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_MSR_SMI) - return true; - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) - return true; - break; - } - - return false; -} - -struct perf_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); -}; - -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); - -static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, -}; - -static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group events_attr_group = { - .name = "events", - .attrs = events_attrs, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); -static struct attribute *format_attrs[] = { - &format_attr_event.attr, - NULL, -}; -static struct attribute_group format_attr_group = { - .name = "format", - .attrs = format_attrs, -}; - -static const struct attribute_group *attr_groups[] = { - &events_attr_group, - &format_attr_group, - NULL, -}; - -static int msr_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - if (cfg >= PERF_MSR_EVENT_MAX) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - if (!msr[cfg].attr) - return -EINVAL; - - event->hw.idx = -1; - event->hw.event_base = msr[cfg].msr; - event->hw.config = cfg; - - return 0; -} - -static inline u64 msr_read_counter(struct perf_event *event) -{ - u64 now; - - if (event->hw.event_base) - rdmsrl(event->hw.event_base, now); - else - rdtscll(now); - - return now; -} -static void msr_event_update(struct perf_event *event) -{ - u64 prev, now; - s64 delta; - - /* Careful, an NMI might modify the previous event value. 
*/ -again: - prev = local64_read(&event->hw.prev_count); - now = msr_read_counter(event); - - if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) - goto again; - - delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) - delta = sign_extend64(delta, 31); - - local64_add(now - prev, &event->count); -} - -static void msr_event_start(struct perf_event *event, int flags) -{ - u64 now; - - now = msr_read_counter(event); - local64_set(&event->hw.prev_count, now); -} - -static void msr_event_stop(struct perf_event *event, int flags) -{ - msr_event_update(event); -} - -static void msr_event_del(struct perf_event *event, int flags) -{ - msr_event_stop(event, PERF_EF_UPDATE); -} - -static int msr_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - msr_event_start(event, flags); - - return 0; -} - -static struct pmu pmu_msr = { - .task_ctx_nr = perf_sw_context, - .attr_groups = attr_groups, - .event_init = msr_event_init, - .add = msr_event_add, - .del = msr_event_del, - .start = msr_event_start, - .stop = msr_event_stop, - .read = msr_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static int __init msr_init(void) -{ - int i, j = 0; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - pr_cont("no MSR PMU driver.\n"); - return 0; - } - - /* Probe the MSRs. */ - for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { - u64 val; - - /* - * Virt sucks arse; you cannot tell if a R/O MSR is present :/ - */ - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining MSRs in the sysfs attrs. */ - for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - perf_pmu_register(&pmu_msr, "msr", -1); - - return 0; -} -device_initcall(msr_init); -- cgit From 27f6d22b037b2be6685e0e27cce929779d634119 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:23 +0100 Subject: perf/x86: Move perf_event.h to its new home Now that all functionality has been moved to arch/x86/events/, move the perf_event.h header and adjust include paths. 
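The adjustment itself is mechanical, a one-line include switch per consumer; the sketch below is drawn from the hunks that follow, with the relative prefix depending only on where the file sits under arch/x86/. A driver in arch/x86/events/intel/ or arch/x86/events/amd/ changes

    #include "../../kernel/cpu/perf_event.h"

to

    #include "../perf_event.h"

while arch/x86/events/core.c includes "perf_event.h" directly and arch/x86/xen/pmu.c moves to "../events/perf_event.h".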
Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/amd/core.c | 2 +- arch/x86/events/amd/ibs.c | 2 +- arch/x86/events/amd/iommu.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/intel/bts.c | 2 +- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/cqm.c | 2 +- arch/x86/events/intel/cstate.c | 2 +- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/intel/knc.c | 2 +- arch/x86/events/intel/lbr.c | 2 +- arch/x86/events/intel/p4.c | 2 +- arch/x86/events/intel/p6.c | 2 +- arch/x86/events/intel/pt.c | 2 +- arch/x86/events/intel/rapl.c | 2 +- arch/x86/events/intel/uncore.h | 2 +- arch/x86/events/perf_event.h | 955 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.h | 955 --------------------------------------- arch/x86/xen/pmu.c | 2 +- 19 files changed, 972 insertions(+), 972 deletions(-) create mode 100644 arch/x86/events/perf_event.h delete mode 100644 arch/x86/kernel/cpu/perf_event.h (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 51b16583679c..049ada8d4e9c 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -5,7 +5,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index a8abd082d932..51087c29b2c2 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -14,7 +14,7 @@ #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" static u32 ibs_caps; diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 629bc700eb08..635e5eba0caf 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -16,7 +16,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #include "iommu.h" #define COUNTER_SHIFT 16 diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 90ca601b5d5c..7402c8182813 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -37,7 +37,7 @@ #include #include -#include "../kernel/cpu/perf_event.h" +#include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 2bd4833367a7..b99dc9258c0f 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -26,7 +26,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" struct bts_ctx { struct perf_output_handle handle; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1edf3013e528..a7ec685657a5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -18,7 +18,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* * Intel PerfMon, used on Core and later. 
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index d1b623a15ac0..1b064c430140 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -7,7 +7,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define MSR_IA32_PQR_ASSOC 0x0c8f #define MSR_IA32_QM_CTR 0x0c8e diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 1bbf37e8604b..7946c4231169 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -89,7 +89,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9677207a2ab1..c8a243d6fc82 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -5,7 +5,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 630bcba91b16..206226e08f49 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -5,7 +5,7 @@ #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" static const u64 knc_perfmon_event_map[] = { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78c88f9293cc..69dd11887dd1 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -5,7 +5,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" enum { LBR_FORMAT_32 = 0x00, diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 1c72fedd96d3..0a5ede187d9c 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -13,7 +13,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define P4_CNTR_LIMIT 3 /* diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index ee5c4e8bd556..1f5c47ab4c65 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -1,7 +1,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* * Not sure about some of these diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e56cebe07994..6af7cf71d6b2 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -29,7 +29,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #include "pt.h" static DEFINE_PER_CPU(struct pt, pt_ctx); diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 9541f50a534a..580f504ec27c 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -48,7 +48,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* * RAPL energy status counters diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 1dea2046990a..6a1340c7f3cc 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -2,7 +2,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define UNCORE_PMU_NAME_LEN 32 #define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h new file mode 100644 index 
000000000000..7bb61e32fb29 --- /dev/null +++ b/arch/x86/events/perf_event.h @@ -0,0 +1,955 @@ +/* + * Performance events x86 architecture header + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include + +/* To enable MSR tracing please use the generic trace points. */ + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ + EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + EXTRA_REG_FE = 4, /* fe_* */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; +}; +/* + * struct hw_perf_event.flags flags + */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ +#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ + + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * Flags PEBS can handle without an PMI. + * + * TID can only be handled by flushing at context switch. + * + */ +#define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ + PERF_SAMPLE_TRANSACTION) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. 
+ */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +/* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +enum intel_excl_state_type { + INTEL_EXCL_UNUSED = 0, /* counter is unused */ + INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ + INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ +}; + +struct intel_excl_states { + enum intel_excl_state_type state[X86_PMC_IDX_MAX]; + bool sched_started; /* true if scheduling has started */ +}; + +struct intel_excl_cntrs { + raw_spinlock_t lock; + + struct intel_excl_states states[2]; + + union { + u16 has_exclusive[2]; + u32 exclusive_present; + }; + + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +#define MAX_LBR_ENTRIES 32 + +enum { + X86_PERF_KFREE_SHARED = 0, + X86_PERF_KFREE_EXCL = 1, + X86_PERF_KFREE_MAX +}; + +struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int enabled; + + int n_events; /* the # of events in the below arrays */ + int n_added; /* the # last events in the below arrays; + they've never been enabled yet */ + int n_txn; /* the # last events in the below arrays; + added in the current transaction */ + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + + int n_excl; /* the number of exclusive events */ + + unsigned int txn_flags; + int is_fake; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; + u64 br_sel; + + /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* + * Intel checkpoint mask + */ + u64 intel_cp_status; + + /* + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB + */ + struct intel_shared_regs *shared_regs; + /* + * manage exclusive counter access between hyperthread + */ + struct event_constraint *constraint_list; /* in enable order */ + struct intel_excl_cntrs *excl_cntrs; + int excl_thread_id; /* 0 or 1 */ + + /* + * AMD specific bits + */ + struct amd_nb *amd_nb; + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ + u64 perf_ctr_virt_mask; + + void *kfree_on_online[X86_PERF_KFREE_MAX]; +}; + +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + 
.weight = (w), \ + .overlap = (o), \ + .flags = f, \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) + +#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ + 0, PERF_X86_EVENT_EXCL) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) + +/* + * Constraint on the Event code. + */ +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * - in_tx + * - in_tx_checkpointed + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +/* Constraint on specific umask bit only + event */ +#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) + +/* Like UEVENT_CONSTRAINT, but match flags too */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + +#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) + +#define INTEL_PLD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) + +#define INTEL_PST_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) + +/* Event constraint, but match on all event flags too. 
*/ +#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + +/* Check only flags, but allow all event/umask */ +#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ + EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) + +/* Check flags and event code, and set the HSW store flag */ +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +/* Check flags and event code, and set the HSW load flag */ +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + +/* Check flags and event code/umask, and set the HSW store flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) + +/* Check flags and event code/umask, and set the HSW load flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + +/* Check flags and event code/umask, and set the HSW N/A flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) + + +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. + * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } + +/* + * Check for end marker with weight == -1 + */ +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight != -1; (e)++) + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. 
+ */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ + bool extra_msr_access; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + .extra_msr_access = true, \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) + +#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ + INTEL_UEVENT_EXTRA_REG(c, \ + MSR_PEBS_LD_LAT_THRESHOLD, \ + 0xffff, \ + LDLAT) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + /* + * PMU supports separate counter range for writing + * values > 32bit. + */ + u64 full_width_write:1; + }; + u64 capabilities; +}; + +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value + +enum { + x86_lbr_exclusive_lbr, + x86_lbr_exclusive_bts, + x86_lbr_exclusive_pt, + x86_lbr_exclusive_max, +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + int (*addr_offset)(int index, bool eventsel); + int (*rdpmc_index)(int index); + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + int idx, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*start_scheduling)(struct cpu_hw_events *cpuc); + + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); + + void (*stop_scheduling)(struct cpu_hw_events *cpuc); + + struct event_constraint *event_constraints; + struct x86_pmu_quirk *quirks; + int perfctr_second_write; + bool late_ack; + unsigned (*limit_period)(struct perf_event *event, unsigned l); + + /* + * sysfs attrs + */ + int attr_rdpmc_broken; + int attr_rdpmc; + struct attribute **format_attrs; + struct attribute **event_attrs; + + ssize_t (*events_sysfs_show)(char *page, u64 config); + struct attribute **cpu_events; + + /* + * CPU Hotplug hooks + */ + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); + void (*sched_task)(struct perf_event_context *ctx, 
+ bool sched_in); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + unsigned int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1, + pebs_prec_dist :1; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; + unsigned long free_running_flags; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ + bool lbr_double_abort; /* duplicated lbr aborts */ + + /* + * Intel PT/LBR/BTS are exclusive + */ + atomic_t lbr_exclusive[x86_lbr_exclusive_max]; + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); +}; + +struct x86_perf_task_context { + u64 lbr_from[MAX_LBR_ENTRIES]; + u64 lbr_to[MAX_LBR_ENTRIES]; + u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; + int lbr_callstack_users; + int lbr_stack_state; +}; + +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + +/* + * x86_pmu flags + */ +#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ +#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ +#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ +#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +extern struct x86_pmu x86_pmu __read_mostly; + +static inline bool x86_pmu_has_lbr_callstack(void) +{ + return x86_pmu.lbr_sel_map && + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; +} + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, true) : index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + (x86_pmu.addr_offset ? 
+ x86_pmu.addr_offset(index, false) : index); +} + +static inline int x86_pmu_rdpmc_index(int index) +{ + return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; +} + +int x86_add_exclusive(unsigned int what); + +void x86_del_exclusive(unsigned int what); + +int x86_reserve_hardware(void); + +void x86_release_hardware(void); + +void hw_perf_lbr_event_destroy(struct perf_event *event); + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); + + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); +} + +void x86_pmu_enable_all(int added); + +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign); +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + +/* + * Not all PMUs provide the right context information to place the reported IP + * into full context. Specifically segment registers are typically not + * supplied. + * + * Assuming the address is a linear address (it is for IBS), we fake the CS and + * vm86 mode using the known zero-based code segment and 'fix up' the registers + * to reflect this. + * + * Intel PEBS/LBR appear to typically provide the effective address, nothing + * much we can do about that but pray and treat it like a linear address. + */ +static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) +{ + regs->cs = kernel_ip(ip) ? 
__KERNEL_CS : __USER_CS; + if (regs->flags & X86_VM_MASK) + regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); + regs->ip = ip; +} + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + +struct attribute **merge_attr(struct attribute **a, struct attribute **b); + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +static inline bool intel_pmu_has_bts(struct perf_event *event) +{ + if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !event->attr.freq && event->hw.sample_period == 1) + return true; + + return false; +} + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_slm_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + +extern struct event_constraint intel_hsw_pebs_event_constraints[]; + +extern struct event_constraint intel_skl_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); + +void intel_ds_init(void); + +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(bool pmi); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +void intel_pmu_lbr_init_snb(void); + +void intel_pmu_lbr_init_hsw(void); + +void intel_pmu_lbr_init_skl(void); + +void intel_pmu_lbr_init_knl(void); + +int intel_pmu_setup_lbr_filter(struct perf_event *event); + +void intel_pt_interrupt(void); + +int intel_bts_interrupt(void); + +void intel_bts_enable_local(void); + +void intel_bts_disable_local(void); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +int knc_pmu_init(void); + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + +static inline int is_ht_workaround_enabled(void) +{ + return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void 
reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +static inline int is_ht_workaround_enabled(void) +{ + return 0; +} +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h deleted file mode 100644 index 7bb61e32fb29..000000000000 --- a/arch/x86/kernel/cpu/perf_event.h +++ /dev/null @@ -1,955 +0,0 @@ -/* - * Performance events x86 architecture header - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * Copyright (C) 2009 Google, Inc., Stephane Eranian - * - * For licencing details see kernel-base/COPYING - */ - -#include - -/* To enable MSR tracing please use the generic trace points. */ - -/* - * | NHM/WSM | SNB | - * register ------------------------------- - * | HT | no HT | HT | no HT | - *----------------------------------------- - * offcore | core | core | cpu | core | - * lbr_sel | core | core | cpu | core | - * ld_lat | cpu | core | cpu | core | - *----------------------------------------- - * - * Given that there is a small number of shared regs, - * we can pre-allocate their slot in the per-cpu - * per-core reg tables. - */ -enum extra_reg_type { - EXTRA_REG_NONE = -1, /* not used */ - - EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ - EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ - EXTRA_REG_LBR = 2, /* lbr_select */ - EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ - EXTRA_REG_FE = 4, /* fe_* */ - - EXTRA_REG_MAX /* number of entries needed */ -}; - -struct event_constraint { - union { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64; - }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; -}; -/* - * struct hw_perf_event.flags flags - */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ - - -struct amd_nb { - int nb_id; /* NorthBridge id */ - int refcnt; /* reference count */ - struct perf_event *owners[X86_PMC_IDX_MAX]; - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; -}; - -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 - -/* - * Flags PEBS can handle without an PMI. - * - * TID can only be handled by flushing at context switch. 
- * - */ -#define PEBS_FREERUNNING_FLAGS \ - (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ - PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ - PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - -/* - * Per register state. - */ -struct er_account { - raw_spinlock_t lock; /* per-core: protect structure */ - u64 config; /* extra MSR config */ - u64 reg; /* extra MSR number */ - atomic_t ref; /* reference count */ -}; - -/* - * Per core/cpu state - * - * Used to coordinate shared registers between HT threads or - * among events on a single PMU. - */ -struct intel_shared_regs { - struct er_account regs[EXTRA_REG_MAX]; - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; - -enum intel_excl_state_type { - INTEL_EXCL_UNUSED = 0, /* counter is unused */ - INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ - INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ -}; - -struct intel_excl_states { - enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - bool sched_started; /* true if scheduling has started */ -}; - -struct intel_excl_cntrs { - raw_spinlock_t lock; - - struct intel_excl_states states[2]; - - union { - u16 has_exclusive[2]; - u32 exclusive_present; - }; - - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; - -#define MAX_LBR_ENTRIES 32 - -enum { - X86_PERF_KFREE_SHARED = 0, - X86_PERF_KFREE_EXCL = 1, - X86_PERF_KFREE_MAX -}; - -struct cpu_hw_events { - /* - * Generic x86 PMC bits - */ - struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int enabled; - - int n_events; /* the # of events in the below arrays */ - int n_added; /* the # last events in the below arrays; - they've never been enabled yet */ - int n_txn; /* the # last events in the below arrays; - added in the current transaction */ - int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ - u64 tags[X86_PMC_IDX_MAX]; - - struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ - struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; - - int n_excl; /* the number of exclusive events */ - - unsigned int txn_flags; - int is_fake; - - /* - * Intel DebugStore bits - */ - struct debug_store *ds; - u64 pebs_enabled; - - /* - * Intel LBR bits - */ - int lbr_users; - void *lbr_context; - struct perf_branch_stack lbr_stack; - struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; - u64 br_sel; - - /* - * Intel host/guest exclude bits - */ - u64 intel_ctrl_guest_mask; - u64 intel_ctrl_host_mask; - struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; - - /* - * Intel checkpoint mask - */ - u64 intel_cp_status; - - /* - * manage shared (per-core, per-cpu) registers - * used on Intel NHM/WSM/SNB - */ - struct intel_shared_regs *shared_regs; - /* - * manage exclusive counter access between hyperthread - */ - struct event_constraint *constraint_list; /* in enable order */ - struct intel_excl_cntrs *excl_cntrs; - int 
excl_thread_id; /* 0 or 1 */ - - /* - * AMD specific bits - */ - struct amd_nb *amd_nb; - /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ - u64 perf_ctr_virt_mask; - - void *kfree_on_online[X86_PERF_KFREE_MAX]; -}; - -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ - { .idxmsk64 = (n) }, \ - .code = (c), \ - .cmask = (m), \ - .weight = (w), \ - .overlap = (o), \ - .flags = f, \ -} - -#define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) - -#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ - 0, PERF_X86_EVENT_EXCL) - -/* - * The overlap flag marks event constraints with overlapping counter - * masks. This is the case if the counter mask of such an event is not - * a subset of any other counter mask of a constraint with an equal or - * higher weight, e.g.: - * - * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); - * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); - * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); - * - * The event scheduler may not select the correct counter in the first - * cycle because it needs to know which subsequent events will be - * scheduled. It may fail to schedule the events then. So we set the - * overlap flag for such constraints to give the scheduler a hint which - * events to select for counter rescheduling. - * - * Care must be taken as the rescheduling algorithm is O(n!) which - * will increase scheduling cycles for an over-commited system - * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros - * and its counter masks must be kept at a minimum. - */ -#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) - -/* - * Constraint on the Event code. - */ -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) - -/* - * Constraint on the Event code + UMask + fixed-mask - * - * filter mask to validate fixed counter events. - * the following filters disqualify for fixed counters: - * - inv - * - edge - * - cnt-mask - * - in_tx - * - in_tx_checkpointed - * The other filters are supported by fixed counters. - * The any-thread option is supported starting with v3. - */ -#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) - -/* - * Constraint on the Event code + UMask - */ -#define INTEL_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) - -/* Constraint on specific umask bit only + event */ -#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) - -/* Like UEVENT_CONSTRAINT, but match flags too */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) - -#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ - HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) - -#define INTEL_PLD_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) - -#define INTEL_PST_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) - -/* Event constraint, but match on all event flags too. 
*/ -#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) - -/* Check only flags, but allow all event/umask */ -#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ - EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) - -/* Check flags and event code, and set the HSW store flag */ -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) - -/* Check flags and event code, and set the HSW load flag */ -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) - -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW store flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) - -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW load flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) - -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW N/A flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) - - -/* - * We define the end marker as having a weight of -1 - * to enable blacklisting of events using a counter bitmask - * of zero and thus a weight of zero. - * The end marker has a weight that cannot possibly be - * obtained from counting the bits in the bitmask. - */ -#define EVENT_CONSTRAINT_END { .weight = -1 } - -/* - * Check for end marker with weight == -1 - */ -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight != -1; (e)++) - -/* - * Extra registers for specific events. - * - * Some events need large masks and require external MSRs. - * Those extra MSRs end up being shared for all events on - * a PMU and sometimes between PMU of sibling HT threads. - * In either case, the kernel needs to handle conflicting - * accesses to those extra, shared, regs. The data structure - * to manage those registers is stored in cpu_hw_event. 
- */ -struct extra_reg { - unsigned int event; - unsigned int msr; - u64 config_mask; - u64 valid_mask; - int idx; /* per_xxx->regs[] reg index */ - bool extra_msr_access; -}; - -#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i, \ - .extra_msr_access = true, \ - } - -#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) - -#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ - ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) - -#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ - INTEL_UEVENT_EXTRA_REG(c, \ - MSR_PEBS_LD_LAT_THRESHOLD, \ - 0xffff, \ - LDLAT) - -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) - -union perf_capabilities { - struct { - u64 lbr_format:6; - u64 pebs_trap:1; - u64 pebs_arch_reg:1; - u64 pebs_format:4; - u64 smm_freeze:1; - /* - * PMU supports separate counter range for writing - * values > 32bit. - */ - u64 full_width_write:1; - }; - u64 capabilities; -}; - -struct x86_pmu_quirk { - struct x86_pmu_quirk *next; - void (*func)(void); -}; - -union x86_pmu_config { - struct { - u64 event:8, - umask:8, - usr:1, - os:1, - edge:1, - pc:1, - interrupt:1, - __reserved1:1, - en:1, - inv:1, - cmask:8, - event2:4, - __reserved2:4, - go:1, - ho:1; - } bits; - u64 value; -}; - -#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value - -enum { - x86_lbr_exclusive_lbr, - x86_lbr_exclusive_bts, - x86_lbr_exclusive_pt, - x86_lbr_exclusive_max, -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - /* - * Generic x86 PMC bits - */ - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(int added); - void (*enable)(struct perf_event *); - void (*disable)(struct perf_event *); - int (*hw_config)(struct perf_event *event); - int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); - unsigned eventsel; - unsigned perfctr; - int (*addr_offset)(int index, bool eventsel); - int (*rdpmc_index)(int index); - u64 (*event_map)(int); - int max_events; - int num_counters; - int num_counters_fixed; - int cntval_bits; - u64 cntval_mask; - union { - unsigned long events_maskl; - unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; - }; - int events_mask_len; - int apic; - u64 max_period; - struct event_constraint * - (*get_event_constraints)(struct cpu_hw_events *cpuc, - int idx, - struct perf_event *event); - - void (*put_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - - void (*start_scheduling)(struct cpu_hw_events *cpuc); - - void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); - - void (*stop_scheduling)(struct cpu_hw_events *cpuc); - - struct event_constraint *event_constraints; - struct x86_pmu_quirk *quirks; - int perfctr_second_write; - bool late_ack; - unsigned (*limit_period)(struct perf_event *event, unsigned l); - - /* - * sysfs attrs - */ - int attr_rdpmc_broken; - int attr_rdpmc; - struct attribute **format_attrs; - struct attribute **event_attrs; - - ssize_t (*events_sysfs_show)(char *page, u64 config); - struct attribute **cpu_events; - - /* - * CPU Hotplug hooks - */ - int (*cpu_prepare)(int cpu); - void (*cpu_starting)(int cpu); - void (*cpu_dying)(int cpu); - void (*cpu_dead)(int cpu); - - void (*check_microcode)(void); - void (*sched_task)(struct perf_event_context *ctx, 
- bool sched_in); - - /* - * Intel Arch Perfmon v2+ - */ - u64 intel_ctrl; - union perf_capabilities intel_cap; - - /* - * Intel DebugStore bits - */ - unsigned int bts :1, - bts_active :1, - pebs :1, - pebs_active :1, - pebs_broken :1, - pebs_prec_dist :1; - int pebs_record_size; - void (*drain_pebs)(struct pt_regs *regs); - struct event_constraint *pebs_constraints; - void (*pebs_aliases)(struct perf_event *event); - int max_pebs_events; - unsigned long free_running_flags; - - /* - * Intel LBR - */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ - bool lbr_double_abort; /* duplicated lbr aborts */ - - /* - * Intel PT/LBR/BTS are exclusive - */ - atomic_t lbr_exclusive[x86_lbr_exclusive_max]; - - /* - * Extra registers for events - */ - struct extra_reg *extra_regs; - unsigned int flags; - - /* - * Intel host/guest support (KVM) - */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); -}; - -struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; - int tos; - int lbr_callstack_users; - int lbr_stack_state; -}; - -#define x86_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = x86_pmu.quirks; \ - x86_pmu.quirks = &__quirk; \ -} while (0) - -/* - * x86_pmu flags - */ -#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ -#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ -#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ -#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ - -#define EVENT_VAR(_id) event_attr_##_id -#define EVENT_PTR(_id) &event_attr_##_id.attr.attr - -#define EVENT_ATTR(_name, _id) \ -static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ - .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ - .id = PERF_COUNT_HW_##_id, \ - .event_str = NULL, \ -}; - -#define EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -extern struct x86_pmu x86_pmu __read_mostly; - -static inline bool x86_pmu_has_lbr_callstack(void) -{ - return x86_pmu.lbr_sel_map && - x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; -} - -DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); - -int x86_perf_event_set_period(struct perf_event *event); - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -extern u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -extern u64 __read_mostly hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -u64 x86_perf_event_update(struct perf_event *event); - -static inline unsigned int x86_pmu_config_addr(int index) -{ - return x86_pmu.eventsel + (x86_pmu.addr_offset ? - x86_pmu.addr_offset(index, true) : index); -} - -static inline unsigned int x86_pmu_event_addr(int index) -{ - return x86_pmu.perfctr + (x86_pmu.addr_offset ? 
- x86_pmu.addr_offset(index, false) : index); -} - -static inline int x86_pmu_rdpmc_index(int index) -{ - return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; -} - -int x86_add_exclusive(unsigned int what); - -void x86_del_exclusive(unsigned int what); - -int x86_reserve_hardware(void); - -void x86_release_hardware(void); - -void hw_perf_lbr_event_destroy(struct perf_event *event); - -int x86_setup_perfctr(struct perf_event *event); - -int x86_pmu_hw_config(struct perf_event *event); - -void x86_pmu_disable_all(void); - -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); - - if (hwc->extra_reg.reg) - wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); -} - -void x86_pmu_enable_all(int added); - -int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int gpmax, int *assign); -int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); - -void x86_pmu_stop(struct perf_event *event, int flags); - -static inline void x86_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -void x86_pmu_enable_event(struct perf_event *event); - -int x86_pmu_handle_irq(struct pt_regs *regs); - -extern struct event_constraint emptyconstraint; - -extern struct event_constraint unconstrained; - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 - return ip > PAGE_OFFSET; -#else - return (long)ip < 0; -#endif -} - -/* - * Not all PMUs provide the right context information to place the reported IP - * into full context. Specifically segment registers are typically not - * supplied. - * - * Assuming the address is a linear address (it is for IBS), we fake the CS and - * vm86 mode using the known zero-based code segment and 'fix up' the registers - * to reflect this. - * - * Intel PEBS/LBR appear to typically provide the effective address, nothing - * much we can do about that but pray and treat it like a linear address. - */ -static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) -{ - regs->cs = kernel_ip(ip) ? 
__KERNEL_CS : __USER_CS; - if (regs->flags & X86_VM_MASK) - regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); - regs->ip = ip; -} - -ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); -ssize_t intel_event_sysfs_show(char *page, u64 config); - -struct attribute **merge_attr(struct attribute **a, struct attribute **b); - -#ifdef CONFIG_CPU_SUP_AMD - -int amd_pmu_init(void); - -#else /* CONFIG_CPU_SUP_AMD */ - -static inline int amd_pmu_init(void) -{ - return 0; -} - -#endif /* CONFIG_CPU_SUP_AMD */ - -#ifdef CONFIG_CPU_SUP_INTEL - -static inline bool intel_pmu_has_bts(struct perf_event *event) -{ - if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !event->attr.freq && event->hw.sample_period == 1) - return true; - - return false; -} - -int intel_pmu_save_and_restart(struct perf_event *event); - -struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event); - -struct intel_shared_regs *allocate_shared_regs(int cpu); - -int intel_pmu_init(void); - -void init_debug_store_on_cpu(int cpu); - -void fini_debug_store_on_cpu(int cpu); - -void release_ds_buffers(void); - -void reserve_ds_buffers(void); - -extern struct event_constraint bts_constraint; - -void intel_pmu_enable_bts(u64 config); - -void intel_pmu_disable_bts(void); - -int intel_pmu_drain_bts_buffer(void); - -extern struct event_constraint intel_core2_pebs_event_constraints[]; - -extern struct event_constraint intel_atom_pebs_event_constraints[]; - -extern struct event_constraint intel_slm_pebs_event_constraints[]; - -extern struct event_constraint intel_nehalem_pebs_event_constraints[]; - -extern struct event_constraint intel_westmere_pebs_event_constraints[]; - -extern struct event_constraint intel_snb_pebs_event_constraints[]; - -extern struct event_constraint intel_ivb_pebs_event_constraints[]; - -extern struct event_constraint intel_hsw_pebs_event_constraints[]; - -extern struct event_constraint intel_skl_pebs_event_constraints[]; - -struct event_constraint *intel_pebs_constraints(struct perf_event *event); - -void intel_pmu_pebs_enable(struct perf_event *event); - -void intel_pmu_pebs_disable(struct perf_event *event); - -void intel_pmu_pebs_enable_all(void); - -void intel_pmu_pebs_disable_all(void); - -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); - -void intel_ds_init(void); - -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); - -void intel_pmu_lbr_reset(void); - -void intel_pmu_lbr_enable(struct perf_event *event); - -void intel_pmu_lbr_disable(struct perf_event *event); - -void intel_pmu_lbr_enable_all(bool pmi); - -void intel_pmu_lbr_disable_all(void); - -void intel_pmu_lbr_read(void); - -void intel_pmu_lbr_init_core(void); - -void intel_pmu_lbr_init_nhm(void); - -void intel_pmu_lbr_init_atom(void); - -void intel_pmu_lbr_init_snb(void); - -void intel_pmu_lbr_init_hsw(void); - -void intel_pmu_lbr_init_skl(void); - -void intel_pmu_lbr_init_knl(void); - -int intel_pmu_setup_lbr_filter(struct perf_event *event); - -void intel_pt_interrupt(void); - -int intel_bts_interrupt(void); - -void intel_bts_enable_local(void); - -void intel_bts_disable_local(void); - -int p4_pmu_init(void); - -int p6_pmu_init(void); - -int knc_pmu_init(void); - -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page); - -static inline int is_ht_workaround_enabled(void) -{ - return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); -} - -#else /* CONFIG_CPU_SUP_INTEL */ - -static inline void 
reserve_ds_buffers(void) -{ -} - -static inline void release_ds_buffers(void) -{ -} - -static inline int intel_pmu_init(void) -{ - return 0; -} - -static inline struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - return NULL; -} - -static inline int is_ht_workaround_enabled(void) -{ - return 0; -} -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 724a08740a04..9466354d3e49 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -11,7 +11,7 @@ #include "pmu.h" /* x86_pmu.handle_irq definition */ -#include "../kernel/cpu/perf_event.h" +#include "../events/perf_event.h" #define XENPMU_IRQ_PROCESSING 1 struct xenpmu { -- cgit From 8f8e2aec9944dd12671182a1a26b8e1a35872a1d Mon Sep 17 00:00:00 2001 From: Alan Date: Wed, 17 Feb 2016 14:10:15 +0000 Subject: x86/platform/intel/mid: Remove dead code Neither ratio nor fsb are ever zero, so remove the 0 case. Signed-off-by: Alan Cox Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/mfld.c | 5 +---- arch/x86/platform/intel-mid/mrfl.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 23381d2174ae..1eb47b6298c2 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init penwell_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c index aaca91753d32..bd1adc621781 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init tangier_arch_setup(void) -- cgit From fe25d078874f2c29c38f4160467d74f5756537c9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 17 Feb 2016 12:26:33 -0600 Subject: Revert "x86/PCI: Don't alloc pcibios-irq when MSI is enabled" Revert 8affb487d4a4 ("x86/PCI: Don't alloc pcibios-irq when MSI is enabled"). This is part of reverting 991de2e59090 ("PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()") to fix regressions it introduced. Link: https://bugzilla.kernel.org/show_bug.cgi?id=111211 Fixes: 991de2e59090 ("PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()") Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki CC: Jiang Liu CC: Joerg Roedel --- arch/x86/pci/common.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2879efc73a96..cb499c5b6ae5 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -713,14 +713,6 @@ int pcibios_add_device(struct pci_dev *dev) int pcibios_alloc_irq(struct pci_dev *dev) { - /* - * If the PCI device was already claimed by core code and has - * MSI enabled, probing of the pcibios IRQ will overwrite - * dev->irq. So bail out if MSI is already enabled. 
- */ - if (pci_dev_msi_enabled(dev)) - return -EBUSY; - return pcibios_enable_irq(dev); } -- cgit From 67b4eab91caf2ad574cab1b17ae09180ea2e116e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 17 Feb 2016 12:26:38 -0600 Subject: Revert "PCI: Add helpers to manage pci_dev->irq and pci_dev->irq_managed" Revert 811a4e6fce09 ("PCI: Add helpers to manage pci_dev->irq and pci_dev->irq_managed"). This is part of reverting 991de2e59090 ("PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()") to fix regressions it introduced. Link: https://bugzilla.kernel.org/show_bug.cgi?id=111211 Fixes: 991de2e59090 ("PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()") Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki CC: Jiang Liu --- arch/x86/pci/intel_mid_pci.c | 4 ++-- arch/x86/pci/irq.c | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 0d24e7c10145..8826ff593ebc 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,7 +215,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) int polarity; int ret; - if (pci_has_managed_irq(dev)) + if (dev->irq_managed && dev->irq > 0) return 0; switch (intel_mid_identify_cpu()) { @@ -256,7 +256,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (pci_has_managed_irq(dev)) { + if (dev->irq_managed && dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; /* diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 32e70343e6fd..72108f0b66b1 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev) struct pci_dev *temp_dev; int irq; - if (pci_has_managed_irq(dev)) + if (dev->irq_managed && dev->irq > 0) return 0; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, @@ -1230,7 +1230,8 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - pci_set_managed_irq(dev, irq); + dev->irq_managed = 1; + dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); return 0; @@ -1258,8 +1259,9 @@ static int pirq_enable_irq(struct pci_dev *dev) static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) { + if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); - pci_reset_managed_irq(dev); + dev->irq = 0; + dev->irq_managed = 0; } } -- cgit From 548acf19234dbda5a52d5a8e7e205af46e9da840 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:12 -0800 Subject: x86/mm: Expand the exception table logic to allow new handling options Huge amounts of help from Andy Lutomirski and Borislav Petkov to produce this. Andy provided the inspiration to add classes to the exception table with a clever bit-squeezing trick, Boris pointed out how much cleaner it would all be if we just had a new field. Linus Torvalds blessed the expansion with: ' I'd rather not be clever in order to save just a tiny amount of space in the exception table, which isn't really criticial for anybody. ' The third field is another relative function pointer, this one to a handler that executes the actions. 
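[Illustrative aside, not part of the patch below.] Each of the three fields is stored as a signed 32-bit offset relative to the field's own address ("target - &field"), which keeps the table position-independent and compact (32 bits per field instead of a 64-bit absolute pointer). A minimal user-space sketch of that encoding, using hypothetical names, is:

#include <stdio.h>
#include <stdint.h>

/* Stand-alone model of one entry: each field holds "target - &field". */
struct extable_entry {
	int32_t insn, fixup, handler;
};

static void demo_handler(void)
{
	puts("handler reached via relative offset");
}

static int32_t encode(const int32_t *field, const void *target)
{
	return (int32_t)((intptr_t)target - (intptr_t)field);
}

static void *decode(const int32_t *field)
{
	return (char *)field + *field;	/* &field + stored delta */
}

/* Static storage keeps the delta to the text section within 32 bits;
 * the object/function pointer casts below are the usual GCC extension. */
static struct extable_entry entry;

int main(void)
{
	entry.handler = encode(&entry.handler, (void *)demo_handler);
	((void (*)(void))decode(&entry.handler))();
	return 0;
}

The kernel decodes its fields the same way (ex_fixup_addr()/ex_fixup_handler() in the patch), and sort_extable() must adjust each stored delta whenever an entry is moved.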
We start out with three handlers: 1: Legacy - just jumps the to fixup IP 2: Fault - provide the trap number in %ax to the fixup code 3: Cleaned up legacy for the uaccess error hack Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6af78fcbd348cf4939875cfda9c19689b5e50b8.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 40 ++++++++++------- arch/x86/include/asm/uaccess.h | 16 +++---- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/traps.c | 6 +-- arch/x86/mm/extable.c | 100 +++++++++++++++++++++++++++++------------ arch/x86/mm/fault.c | 2 +- 6 files changed, 109 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..f5063b6659eb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -44,19 +44,22 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE(from,to) \ +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ .pushsection "__ex_table","a" ; \ - .balign 8 ; \ + .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ + .long (handler) - . ; \ .popsection -# define _ASM_EXTABLE_EX(from,to) \ - .pushsection "__ex_table","a" ; \ - .balign 8 ; \ - .long (from) - . ; \ - .long (to) - . + 0x7ffffff0 ; \ - .popsection +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -89,19 +92,24 @@ .endm #else -# define _ASM_EXTABLE(from,to) \ +# define _EXPAND_EXTABLE_HANDLE(x) #x +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ + " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ + " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ " .popsection\n" -# define _ASM_EXTABLE_EX(from,to) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - . + 0x7ffffff0\n" \ - " .popsection\n" +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a4a30e4b2d34..c0f27d7ea7ff 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un likely(!__range_not_ok(addr, size, user_addr_max())) /* - * The exception table consists of pairs of addresses relative to the - * exception table enty itself: the first is the address of an - * instruction that is allowed to fault, and the second is the address - * at which the program should continue. No registers are modified, - * so it is entirely up to the continuation code to figure out what to - * do. + * The exception table consists of triples of addresses relative to the + * exception table entry itself. 
The first address is of an instruction + * that is allowed to fault, the second is the target at which the program + * should continue. The third is a handler function to deal with the fault + * caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un */ struct exception_table_entry { - int insn, fixup; + int insn, fixup, handler; }; /* This is not the generic standard exception_table_entry format */ #define ARCH_HAS_SORT_EXTABLE #define ARCH_HAS_SEARCH_EXTABLE -extern int fixup_exception(struct pt_regs *regs); +extern int fixup_exception(struct pt_regs *regs, int trapnr); +extern bool ex_has_fault_handler(unsigned long ip); extern int early_fixup_exception(unsigned long *ip); /* diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6cc873..0f05deeff5ce 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - if (fixup_exception(regs)) + if (fixup_exception(regs, trapnr)) return 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..211c11c7bba4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs)) + if (fixup_exception(regs, X86_TRAP_GP)) return; tsk->thread.error_code = error_code; @@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) conditional_sti(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; die(str, regs, error_code); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 903ec1e9c326..9dd7e4b7fcde 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -3,6 +3,9 @@ #include #include +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *, int); + static inline unsigned long ex_insn_addr(const struct exception_table_entry *x) { @@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} -int fixup_exception(struct pt_regs *regs) +bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *fixup; - unsigned long new_ip; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_default); + +bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fault); + +bool ex_handler_ext(const struct 
exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* Special hack for uaccess_err */ + current_thread_info()->uaccess_err = 1; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_ext); + +bool ex_has_fault_handler(unsigned long ip) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(ip); + if (!e) + return false; + handler = ex_fixup_handler(e); + + return handler == ex_handler_fault; +} + +int fixup_exception(struct pt_regs *regs, int trapnr) +{ + const struct exception_table_entry *e; + ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs) } #endif - fixup = search_exception_tables(regs->ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); - - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; - new_ip -= 0x7ffffff0; - } - regs->ip = new_ip; - return 1; - } + e = search_exception_tables(regs->ip); + if (!e) + return 0; - return 0; + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); } /* Restricted version used during very early boot */ int __init early_fixup_exception(unsigned long *ip) { - const struct exception_table_entry *fixup; + const struct exception_table_entry *e; unsigned long new_ip; + ex_handler_t handler; - fixup = search_exception_tables(*ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); + e = search_exception_tables(*ip); + if (!e) + return 0; - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* uaccess handling not supported during early boot */ - return 0; - } + new_ip = ex_fixup_addr(e); + handler = ex_fixup_handler(e); - *ip = new_ip; - return 1; - } + /* special handling not supported during early boot */ + if (handler != ex_handler_default) + return 0; - return 0; + *ip = new_ip; + return 1; } /* @@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup += i; i += 4; + p->handler += i; + i += 4; } sort(start, finish - start, sizeof(struct exception_table_entry), @@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup -= i; i += 4; + p->handler -= i; + i += 4; } } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eef44d9a3f77..495946c3f9dd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) { + if (fixup_exception(regs, X86_TRAP_PF)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from -- cgit From b2f9d678e28ca71ce650eac82f26dd287b47e89a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:13 -0800 Subject: x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Extend the severity checking code to add a new context IN_KERN_RECOV which is used to indicate that the machine check was triggered by code in the kernel tagged with _ASM_EXTABLE_FAULT() so that the ex_handler_fault() handler will provide the fixup code with the trap number. Major re-work to the tail code in do_machine_check() to make all this readable/maintainable. One functional change is that tolerant=3 no longer stops recovery actions. Revert to only skipping sending SIGBUS to the current process. 
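[Illustrative aside; the helper below is hypothetical and not part of this series.] The point of IN_KERNEL_RECOV is that a kernel memory access can opt in to machine check recovery by tagging itself with _ASM_EXTABLE_FAULT() from the previous patch. A rough sketch of such a helper (kernel-only fragment, not buildable on its own):

static int read_word_mcsafe(const u64 *src, u64 *val)
{
	int trapnr = 0;

	/*
	 * If the load at 1: takes a machine check, the severity code sees
	 * IN_KERNEL_RECOV and do_machine_check() resumes at 2: through
	 * ex_handler_fault(), which puts the trap number in %eax.
	 * (*val is not meaningful after a fault.)
	 */
	asm volatile("1:	movq (%[src]), %[v]\n"
		     "2:\n"
		     _ASM_EXTABLE_FAULT(1b, 2b)
		     : [v] "=r" (*val), "+a" (trapnr)
		     : [src] "r" (src), "m" (*src));

	return trapnr;	/* 0 on success, X86_TRAP_MC after recovery */
}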
Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/89d243d05a7943bb187d1074bb30d9c4f482d5f5.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 22 +++++++++- arch/x86/kernel/cpu/mcheck/mce.c | 70 ++++++++++++++++--------------- 2 files changed, 56 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c222071..5119766d9889 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -86,6 +88,10 @@ static struct severity { PANIC, "In kernel and no restart IP", EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) @@ -122,6 +128,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), + MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b7180801ea33..524f2a8492d7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. 
This only handles real * exceptions when something got corrupted coming in through int 18. @@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -- cgit From 0f68c088c0adb3c3bbeb487c4ebcde91fd5d34be Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:13 -0800 Subject: x86/cpufeature: Create a new synthetic cpu capability for machine check recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel Software Developer Manual describes bit 24 in the MCG_CAP MSR: MCG_SER_P (software error recovery support present) flag, bit 24 — Indicates (when set) that the processor supports software error recovery But only some models with this capability bit set will actually generate recoverable machine checks. Check the model name and set a synthetic capability bit. Provide a command line option to set this bit anyway in case the kernel doesn't recognise the model name. 
Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2e5bfb23c89800a036fb8a45fa97a74bb16bc362.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 3 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0ceb6adc8a48..6663fae71b12 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,6 +106,7 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527e462f..18d2ba9c8e44 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -113,6 +113,7 @@ struct mca_config { bool ignore_ce; bool disabled; bool ser; + bool recovery; bool bios_cmci_threshold; u8 banks; s8 bootlog; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..b5b187c8cc07 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1576,6 +1576,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; + /* + * MCG_CAP.MCG_SER_P is necessary but not sufficient to know + * whether this processor will actually generate recoverable + * machine checks. Check to see if this is an E7 model Xeon. + * We can't do a model number check because E5 and E7 use the + * same model number. E5 doesn't support recovery, E7 does. + */ + if (mca_cfg.recovery || (mca_cfg.ser && + !strncmp(c->x86_model_id, + "Intel(R) Xeon(R) CPU E7-", 24))) + set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2028,6 +2039,8 @@ static int __init mcheck_enable(char *str) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; + else if (!strcmp(str, "recovery")) + cfg->recovery = true; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); -- cgit From 5c1d90f51027e197e1299ab1235a2fed78910905 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:05 -0800 Subject: x86/mm/pkeys: Add PTE bits for storing protection key Previous documentation has referred to these 4 bits as "ignored". That means that software could have made use of them. But, as far as I know, the kernel never used them. They are still ignored when protection keys is not enabled, so they could theoretically still get used for software purposes. We also implement "empty" versions so that code that references to them can be optimized away by the compiler when the config option is not enabled. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210205.81E33ED6@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4432ab7f407c..cae10ba3c975 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -20,13 +20,18 @@ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ +#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ +#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ +#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */ +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ -#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ -#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ @@ -47,6 +52,17 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) +#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) +#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2) +#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3) +#else +#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0)) +#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0)) +#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) +#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) +#endif #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK -- cgit From b3ecd51559ae7a8f40b10443773b9cd0e6a50f5e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:07 -0800 Subject: x86/mm/pkeys: Add new 'PF_PK' page fault error code bit Note: "PK" is how the Intel SDM refers to this bit, so we also use that nomenclature. This only defines the bit, it does not plumb it anywhere to be handled. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210207.DA7B43E6@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eef44d9a3f77..9f72f9c9be4f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -33,6 +33,7 @@ * bit 2 == 0: kernel-mode access 1: user-mode access * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access */ enum x86_pf_error_code { @@ -41,6 +42,7 @@ enum x86_pf_error_code { PF_USER = 1 << 2, PF_RSVD = 1 << 3, PF_INSTR = 1 << 4, + PF_PK = 1 << 5, }; /* @@ -916,6 +918,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; + /* + * Note: We do not do lazy flushing on protection key + * changes, so no spurious fault will ever set PF_PK. + */ + if ((error_code & PF_PK)) + return 1; return 1; } -- cgit From 63c17fb8e5a46a16e10e82005748837fd11a2024 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:08 -0800 Subject: mm/core, x86/mm/pkeys: Store protection bits in high VMA flags vma->vm_flags is an 'unsigned long', so has space for 32 flags on 32-bit architectures. The high 32 bits are unused on 64-bit platforms. We've steered away from using the unused high VMA bits for things because we would have difficulty supporting it on 32-bit. Protection Keys are not available in 32-bit mode, so there is no concern about supporting this feature in 32-bit mode or on 32-bit CPUs. This patch carves out 4 bits from the high half of vma->vm_flags and allows architectures to set config option to make them available. Sparse complains about these constants unless we explicitly call them "UL". Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Valentin Rothberg Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Xie XiuQi Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210208.81AF00D5@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3632cdd03201..fb2ebeb9a692 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,6 +155,7 @@ config X86 select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS + select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y -- cgit From 8f62c883222c9e3c06d60b5e55e307a3d1f18257 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:10 -0800 Subject: x86/mm/pkeys: Add arch-specific VMA protection bits Lots of things seem to do: vma->vm_page_prot = vm_get_page_prot(flags); and the ptes get created right from things we pull out of ->vm_page_prot. So it is very convenient if we can store the protection key in flags and vm_page_prot, just like the existing permission bits (_PAGE_RW/PRESENT). 
It greatly reduces the amount of plumbing and arch-specific hacking we have to do in generic code. This also takes the new PROT_PKEY{0,1,2,3} flags and turns *those* in to VM_ flags for vma->vm_flags. The protection key values are stored in 4 places: 1. "prot" argument to system calls 2. vma->vm_flags, filled from the mmap "prot" 3. vma->vm_page prot, filled from vma->vm_flags 4. the PTE itself. The pseudocode for these for steps are as follows: mmap(PROT_PKEY*) vma->vm_flags = ... | arch_calc_vm_prot_bits(mmap_prot); vma->vm_page_prot = ... | arch_vm_get_page_prot(vma->vm_flags); pte = pfn | vma->vm_page_prot Note that this provides a new definitions for x86: arch_vm_get_page_prot() Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210210.FE483A42@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 11 +++++++++++ arch/x86/include/asm/pgtable_types.h | 12 ++++++++++-- arch/x86/include/uapi/asm/mman.h | 16 ++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bfd9b2a35a0b..94c4c8b5cb8f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -275,4 +275,15 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, mpx_notify_unmap(mm, vma, start, end); } +static inline int vma_pkey(struct vm_area_struct *vma) +{ + u16 pkey = 0; +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | + VM_PKEY_BIT2 | VM_PKEY_BIT3; + pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; +#endif + return pkey; +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index cae10ba3c975..8c35cf0cc2ef 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -115,7 +115,12 @@ #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY) -/* Set of bits not changed in pte_modify */ +/* + * Set of bits not changed in pte_modify. The pte's + * protection key is treated like _PAGE_RW, for + * instance, and is *not* included in this mask since + * pte_modify() does modify it. + */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ _PAGE_SOFT_DIRTY) @@ -231,7 +236,10 @@ enum page_cache_mode { /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ +/* + * Extracts the flags from a (pte|pmd|pud|pgd)val_t + * This includes the protection key value. 
+ */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 513b05f15bb4..e8562e0a2993 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -6,6 +6,22 @@ #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +/* + * Take the 4 protection key bits out of the vma->vm_flags + * value and turn them in to the bits that we can put in + * to a pte. + * + * Only override these if Protection Keys are available + * (which is only on 64-bit). + */ +#define arch_vm_get_page_prot(vm_flags) __pgprot( \ + ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \ + ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \ + ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \ + ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0)) +#endif + #include #endif /* _ASM_X86_MMAN_H */ -- cgit From 7b2d0dbac4890c8ca4a8acc57709639fc8b158e9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:11 -0800 Subject: x86/mm/pkeys: Pass VMA down in to fault signal generation code During a page fault, we look up the VMA to ensure that the fault is in a region with a valid mapping. But, in the top-level page fault code we don't need the VMA for much else. Once we have decided that an access is bad, we are going to send a signal no matter what and do not need the VMA any more. So we do not pass it down in to the signal generation code. But, for protection keys, we need the VMA. It tells us *which* protection key we violated if we get a PF_PK. So, we need to pass the VMA down and fill in siginfo->si_pkey. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210211.AD3B36A3@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72f9c9be4f..3c51c66b65fc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -171,7 +171,8 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, int fault) + struct task_struct *tsk, struct vm_area_struct *vma, + int fault) { unsigned lsb = 0; siginfo_t info; @@ -656,6 +657,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; + /* No context means no VMA to pass down */ + struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { @@ -679,7 +682,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. 
*/ - force_sig_info_fault(signal, si_code, address, tsk, 0); + force_sig_info_fault(signal, si_code, address, + tsk, vma, 0); } /* @@ -756,7 +760,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int si_code) + unsigned long address, struct vm_area_struct *vma, + int si_code) { struct task_struct *tsk = current; @@ -799,7 +804,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); return; } @@ -812,14 +817,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, struct vm_area_struct *vma) { - __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int si_code) + unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; @@ -829,25 +834,25 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, si_code); + __bad_area_nosemaphore(regs, error_code, address, vma, si_code); } static noinline void bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - __bad_area(regs, error_code, address, SEGV_MAPERR); + __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, struct vm_area_struct *vma) { - __bad_area(regs, error_code, address, SEGV_ACCERR); + __bad_area(regs, error_code, address, vma, SEGV_ACCERR); } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - unsigned int fault) + struct vm_area_struct *vma, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -874,12 +879,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, unsigned int fault) + unsigned long address, struct vm_area_struct *vma, + unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -903,9 +909,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, fault); + do_sigbus(regs, error_code, address, vma, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, vma); else BUG(); } @@ -1119,7 +1125,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Don't take the mm semaphore here. 
If we fixup a prefetch * fault we could otherwise deadlock: */ - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } @@ -1132,7 +1138,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } @@ -1141,7 +1147,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } @@ -1185,7 +1191,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && !search_exception_tables(regs->ip)) { - bad_area_nosemaphore(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address, NULL); return; } retry: @@ -1233,7 +1239,7 @@ retry: */ good_area: if (unlikely(access_error(error_code, vma))) { - bad_area_access_error(regs, error_code, address); + bad_area_access_error(regs, error_code, address, vma); return; } @@ -1271,7 +1277,7 @@ good_area: up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, fault); + mm_fault_error(regs, error_code, address, vma, fault); return; } -- cgit From 019132ff3daf36c97a4006655dfd00ee42f2b590 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:14 -0800 Subject: x86/mm/pkeys: Fill in pkey field in siginfo This fills in the new siginfo field: si_pkey to indicate to userspace which protection key was set on the PTE that we faulted on. Note though that *ALL* protection key faults have to be generated by a valid, present PTE at some point. But this code does no PTE lookups which seeds odd. The reason is that we take advantage of the way we generate PTEs from VMAs. All PTEs under a VMA share some attributes. For instance, they are _all_ either PROT_READ *OR* PROT_NONE. They also always share a protection key, so we never have to walk the page tables; we just use the VMA. Note that _pkey is a 64-bit value. The current hardware only supports 4-bit protection keys. We do this because there is _plenty_ of space in _sigfault and it is possible that future processors would support more than 4 bits of protection keys. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210213.ABC488FA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 5 +++ arch/x86/mm/fault.c | 64 +++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8c35cf0cc2ef..7b5efe264eff 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -65,6 +65,11 @@ #endif #define __HAVE_ARCH_PTE_SPECIAL +#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ + _PAGE_PKEY_BIT1 | \ + _PAGE_PKEY_BIT2 | \ + _PAGE_PKEY_BIT3) + #ifdef CONFIG_KMEMCHECK #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) #else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3c51c66b65fc..6e71dcf699ab 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -15,12 +15,14 @@ #include /* exception_enter(), ... */ #include /* faulthandler_disabled() */ +#include /* boot_cpu_has, ... */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_ADDR */ #include /* emulate_vsyscall */ #include /* struct vm86 */ +#include /* vma_pkey() */ #define CREATE_TRACE_POINTS #include @@ -169,6 +171,56 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) return prefetch; } +/* + * A protection key fault means that the PKRU value did not allow + * access to some PTE. Userspace can figure out what PKRU was + * from the XSAVE state, and this function fills out a field in + * siginfo so userspace can discover which protection key was set + * on the PTE. + * + * If we get here, we know that the hardware signaled a PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set PKRU to deny access to pkey=4, touches page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ +static void fill_sig_info_pkey(int si_code, siginfo_t *info, + struct vm_area_struct *vma) +{ + /* This is effectively an #ifdef */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + /* Fault not from Protection Keys: nothing to do */ + if (si_code != SEGV_PKUERR) + return; + /* + * force_sig_info_fault() is called from a number of + * contexts, some of which have a VMA and some of which + * do not. The PF_PK handing happens after we have a + * valid VMA, so we should never reach this without a + * valid VMA. + */ + if (!vma) { + WARN_ONCE(1, "PKU fault with no VMA passed in"); + info->si_pkey = 0; + return; + } + /* + * si_pkey should be thought of as a strong hint, but not + * absolutely guranteed to be 100% accurate because of + * the race explained above. 
+ */ + info->si_pkey = vma_pkey(vma); +} + static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, struct task_struct *tsk, struct vm_area_struct *vma, @@ -187,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; + fill_sig_info_pkey(si_code, &info, vma); + force_sig_info(si_signo, &info, tsk); } @@ -847,7 +901,15 @@ static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma) { - __bad_area(regs, error_code, address, vma, SEGV_ACCERR); + /* + * This OSPKE check is not strictly necessary at runtime. + * But, doing it this way allows compiler optimizations + * if pkeys are compiled out. + */ + if (boot_cpu_has(X86_FEATURE_OSPKE) && (error_code & PF_PK)) + __bad_area(regs, error_code, address, vma, SEGV_PKUERR); + else + __bad_area(regs, error_code, address, vma, SEGV_ACCERR); } static void -- cgit From a927cb83f3300bcb1ae18672e58029acddd18b33 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:15 -0800 Subject: x86/mm/pkeys: Add functions to fetch PKRU This adds the raw instruction to access PKRU as well as some accessor functions that correctly handle when the CPU does not support the instruction. We don't use it here, but we will use read_pkru() in the next patch. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210215.15238D34@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 8 ++++++++ arch/x86/include/asm/special_insns.h | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0687c4748b8f..e997dcc6ee2b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -99,6 +99,14 @@ static inline int pte_dirty(pte_t pte) return pte_flags(pte) & _PAGE_DIRTY; } + +static inline u32 read_pkru(void) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + return __read_pkru(); + return 0; +} + static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2270e41b32fd..aee6e76e561e 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -98,6 +98,28 @@ static inline void native_write_cr8(unsigned long val) } #endif +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +static inline u32 __read_pkru(void) +{ + u32 ecx = 0; + u32 edx, pkru; + + /* + * "rdpkru" instruction. Places PKRU contents in to EAX, + * clears EDX and requires that ecx=0. + */ + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (pkru), "=d" (edx) + : "c" (ecx)); + return pkru; +} +#else +static inline u32 __read_pkru(void) +{ + return 0; +} +#endif + static inline void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); -- cgit From 1874f6895c92d991ccf85edcc55a0d9dd552d71c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:18 -0800 Subject: x86/mm/gup: Simplify get_user_pages() PTE bit handling The current get_user_pages() code is a wee bit more complicated than it needs to be for pte bit checking. 
Currently, it establishes a mask of required pte _PAGE_* bits and ensures that the pte it goes after has all those bits. This consolidates the three identical copies of this code. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210218.3A2D4045@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index ce5e4545203b..2f0a32945cda 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -74,6 +74,24 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } +/* + * 'pteval' can come from a pte, pmd or pud. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 3 types. + */ +static inline int pte_allows_gup(unsigned long pteval, int write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + return 1; +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -83,14 +101,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; - unsigned long mask; int nr_start = *nr; pte_t *ptep; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - ptep = pte_offset_map(&pmd, addr); do { pte_t pte = gup_get_pte(ptep); @@ -110,7 +123,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_unmap(ptep); return 0; } - } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + } else if (!pte_allows_gup(pte_val(pte), write) || + pte_special(pte)) { pte_unmap(ptep); return 0; } @@ -164,14 +178,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; struct page *head, *page; int refs; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - if ((pmd_flags(pmd) & mask) != mask) + if (!pte_allows_gup(pmd_val(pmd), write)) return 0; VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); @@ -231,14 +241,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask; struct page *head, *page; int refs; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - if ((pud_flags(pud) & mask) != mask) + if (!pte_allows_gup(pud_val(pud), write)) return 0; /* hugepages are never "special" */ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); -- cgit From 33a709b25a760b91184bb335cf7d7c32b8123013 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:19 -0800 Subject: mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys Today, for normal faults and page table walks, we check the VMA and/or PTE to ensure that it is compatible with the action. For instance, if we get a write fault on a non-writeable VMA, we SIGSEGV. 
We try to do the same thing for protection keys. Basically, we try to make sure that if a user does this: mprotect(ptr, size, PROT_NONE); *ptr = foo; they see the same effects with protection keys when they do this: mprotect(ptr, size, PROT_READ|PROT_WRITE); set_pkey(ptr, size, 4); wrpkru(0xffffff3f); // access disable pkey 4 *ptr = foo; The state to do that checking is in the VMA, but we also sometimes have to do it on the page tables only, like when doing a get_user_pages_fast() where we have no VMA. We add two functions and expose them to generic code: arch_pte_access_permitted(pte_flags, write) arch_vma_access_permitted(vma, write) These are, of course, backed up in x86 arch code with checks against the PTE or VMA's protection key. But, there are also cases where we do not want to respect protection keys. When we ptrace(), for instance, we do not want to apply the tracer's PKRU permissions to the PTEs from the process being traced. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Alexey Kardashevskiy Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boaz Harrosh Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: David Gibson Cc: David Hildenbrand Cc: David Vrabel Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Dominik Vogt Cc: Guan Xuetao Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Hugh Dickins Cc: Jason Low Cc: Jerome Marchand Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Ellerman Cc: Michal Hocko Cc: Mikulas Patocka Cc: Minchan Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Shachar Raindel Cc: Stephen Smalley Cc: Toshi Kani Cc: Vlastimil Babka Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160212210219.14D5D715@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 49 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 29 ++++++++++++++++++++++ arch/x86/mm/fault.c | 21 +++++++++++++++- arch/x86/mm/gup.c | 5 ++++ 4 files changed, 103 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 94c4c8b5cb8f..19036cdbed8f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -286,4 +286,53 @@ static inline int vma_pkey(struct vm_area_struct *vma) return pkey; } +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + +/* + * We only want to enforce protection keys on the current process + * because we effectively have no access to PKRU for other + * processes or any way to tell *which * PKRU in a threaded + * process we could use. + * + * So do not enforce things if the VMA is not from the current + * mm, or if we are in a kernel thread. + */ +static inline bool vma_is_foreign(struct vm_area_struct *vma) +{ + if (!current->mm) + return true; + /* + * Should PKRU be enforced on the access to this VMA? If + * the VMA is from another process, then PKRU has no + * relevance and should not be enforced. 
+ */ + if (current->mm != vma->vm_mm) + return true; + + return false; +} + +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +{ + /* allow access if the VMA is not one from this process */ + if (vma_is_foreign(vma)) + return true; + return __pkru_allows_pkey(vma_pkey(vma), write); +} + +static inline bool arch_pte_access_permitted(pte_t pte, bool write) +{ + return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e997dcc6ee2b..3cbfae80abb2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -919,6 +919,35 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif +#define PKRU_AD_BIT 0x1 +#define PKRU_WD_BIT 0x2 + +static inline bool __pkru_allows_read(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * 2; + return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); +} + +static inline bool __pkru_allows_write(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * 2; + /* + * Access-disable disables writes too so we need to check + * both bits here. + */ + return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); +} + +static inline u16 pte_flags_pkey(unsigned long pte_flags) +{ +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* ifdef to avoid doing 59-bit shift on 32-bit values */ + return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0; +#else + return 0; +#endif +} + #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6e71dcf699ab..319331afae24 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -897,6 +897,16 @@ bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); } +static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return false; + if (error_code & PF_PK) + return true; + return false; +} + static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma) @@ -906,7 +916,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ - if (boot_cpu_has(X86_FEATURE_OSPKE) && (error_code & PF_PK)) + if (bad_area_access_from_pkeys(error_code, vma)) __bad_area(regs, error_code, address, vma, SEGV_PKUERR); else __bad_area(regs, error_code, address, vma, SEGV_ACCERR); @@ -1081,6 +1091,15 @@ int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { + /* + * Access or read was blocked by protection keys. We do + * this check before any others because we do not want + * to, for instance, confuse a protection-key-denied + * write with one for which we should do a COW. 
+ */ + if (error_code & PF_PK) + return 1; + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 2f0a32945cda..bab259e75984 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -11,6 +11,7 @@ #include #include +#include #include static inline pte_t gup_get_pte(pte_t *ptep) @@ -89,6 +90,10 @@ static inline int pte_allows_gup(unsigned long pteval, int write) if ((pteval & need_pte_bits) != need_pte_bits) return 0; + /* Check memory protection keys permissions. */ + if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) + return 0; + return 1; } -- cgit From 1b2ee1266ea647713dbaf44825967c180dfc8d76 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:21 -0800 Subject: mm/core: Do not enforce PKEY permissions on remote mm access We try to enforce protection keys in software the same way that we do in hardware. (See long example below). But, we only want to do this when accessing our *own* process's memory. If GDB set PKRU[6].AD=1 (disable access to PKEY 6), then tried to PTRACE_POKE a target process which just happened to have some mprotect_pkey(pkey=6) memory, we do *not* want to deny the debugger access to that memory. PKRU is fundamentally a thread-local structure and we do not want to enforce it on access to _another_ thread's data. This gets especially tricky when we have workqueues or other delayed-work mechanisms that might run in a random process's context. We can check that we only enforce pkeys when operating on our *own* mm, but delayed work gets performed when a random user context is active. We might end up with a situation where a delayed-work gup fails when running randomly under its "own" task but succeeds when running under another process. We want to avoid that. To avoid that, we use the new GUP flag: FOLL_REMOTE and add a fault flag: FAULT_FLAG_REMOTE. They indicate that we are walking an mm which is not guranteed to be the same as current->mm and should not be subject to protection key enforcement. Thanks to Jerome Glisse for pointing out this scenario. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boaz Harrosh Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Gibson Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Dominik Vogt Cc: Eric B Munson Cc: Geliang Tang Cc: Guan Xuetao Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Hugh Dickins Cc: Jan Kara Cc: Jason Low Cc: Jerome Marchand Cc: Joerg Roedel Cc: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Ellerman Cc: Michal Hocko Cc: Mikulas Patocka Cc: Minchan Kim Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Shachar Raindel Cc: Vlastimil Babka Cc: Xie XiuQi Cc: iommu@lists.linux-foundation.org Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19036cdbed8f..b4d939a17e60 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -322,10 +322,11 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool foreign) { /* allow access if the VMA is not one from this process */ - if (vma_is_foreign(vma)) + if (foreign || vma_is_foreign(vma)) return true; return __pkru_allows_pkey(vma_pkey(vma), write); } -- cgit From 07f146f53e8de826e4afa3a88ea65bdb13c24959 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:22 -0800 Subject: x86/mm/pkeys: Optimize fault handling in access_error() We might not strictly have to make modifictions to access_error() to check the VMA here. If we do not, we will do this: 1. app sets VMA pkey to K 2. app touches a !present page 3. do_page_fault(), allocates and maps page, sets pte.pkey=K 4. return to userspace 5. touch instruction reexecutes, but triggers PF_PK 6. do PKEY signal What happens with this patch applied: 1. app sets VMA pkey to K 2. app touches a !present page 3. do_page_fault() notices that K is inaccessible 4. do PKEY signal We basically skip the fault that does an allocation. So what this lets us do is protect areas from even being *populated* unless it is accessible according to protection keys. That seems handy to me and makes protection keys work more like an mprotect()'d mapping. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210222.EBB63D8C@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 319331afae24..68ecdffe284e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -900,10 +900,16 @@ bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { + /* This code is always called on the current mm */ + bool foreign = false; + if (!boot_cpu_has(X86_FEATURE_OSPKE)) return false; if (error_code & PF_PK) return true; + /* this checks permission keys on the VMA: */ + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), foreign)) + return true; return false; } @@ -1091,6 +1097,8 @@ int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { + /* This is only called for the current mm, so: */ + bool foreign = false; /* * Access or read was blocked by protection keys. We do * this check before any others because we do not want @@ -1099,6 +1107,13 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) */ if (error_code & PF_PK) return 1; + /* + * Make sure to check the VMA so that we do not perform + * faults just to hit a PF_PK as soon as we fill in a + * page. + */ + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), foreign)) + return 1; if (error_code & PF_WRITE) { /* write, present and write, not present: */ -- cgit From d61172b4b695b821388cdb6088a41d431bcbb93b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:24 -0800 Subject: mm/core, x86/mm/pkeys: Differentiate instruction fetches As discussed earlier, we attempt to enforce protection keys in software. However, the code checks all faults to ensure that they are not violating protection key permissions. It was assumed that all faults are either write faults where we check PKRU[key].WD (write disable) or read faults where we check the AD (access disable) bit. But, there is a third category of faults for protection keys: instruction faults. Instruction faults never run afoul of protection keys because they do not affect instruction fetches. So, plumb the PF_INSTR bit down in to the arch_vma_access_permitted() function where we do the protection key checks. We also add a new FAULT_FLAG_INSTRUCTION. This is because handle_mm_fault() is not passed the architecture-specific error_code where we keep PF_INSTR, so we need to encode the instruction fetch information in to the arch-generic fault flags. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210224.96928009@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 5 ++++- arch/x86/mm/fault.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index b4d939a17e60..6572b949cbca 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -323,8 +323,11 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) } static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool foreign) + bool write, bool execute, bool foreign) { + /* pkeys never affect instruction fetches */ + if (execute) + return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 68ecdffe284e..d81744e6f39f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -908,7 +908,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, if (error_code & PF_PK) return true; /* this checks permission keys on the VMA: */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), + (error_code & PF_INSTR), foreign)) return true; return false; } @@ -1112,7 +1113,8 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) * faults just to hit a PF_PK as soon as we fill in a * page. */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), + (error_code & PF_INSTR), foreign)) return 1; if (error_code & PF_WRITE) { @@ -1267,6 +1269,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (error_code & PF_WRITE) flags |= FAULT_FLAG_WRITE; + if (error_code & PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; /* * When running in the kernel we expect faults to occur only to -- cgit From c0b17b5bd4b7b98e7c6b67c9f69343b64711271b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:25 -0800 Subject: x86/mm/pkeys: Dump PKRU with other kernel registers Protection Keys never affect kernel mappings. But, they can affect whether the kernel will fault when it touches a user mapping. The kernel doesn't touch user mappings without some careful choreography and these accesses don't generally result in oopses. But, if one does, we definitely want to have PKRU available so we can figure out if protection keys played a role. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210225.BF0D4482@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b9d99e0f82c4..776229e98202 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -116,6 +116,8 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + if (boot_cpu_has(X86_FEATURE_OSPKE)) + printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); } void release_thread(struct task_struct *dead_task) -- cgit From c1192f8428414679c8126180e690f8daa1d4d98a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:27 -0800 Subject: x86/mm/pkeys: Dump pkey from VMA in /proc/pid/smaps The protection key can now be just as important as read/write permissions on a VMA. We need some debug mechanism to help figure out if it is in play. smaps seems like a logical place to expose it. arch/x86/kernel/setup.c is a bit of a weirdo place to put this code, but it already had seq_file.h and there was not a much better existing place to put it. We also use no #ifdef. If protection keys is .config'd out we will effectively get the same function as if we used the weak generic function. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jerome Marchand Cc: Jiri Kosina Cc: Joerg Roedel Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Mark Salter Cc: Mark Williamson Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Vlastimil Babka Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210227.4F8EB3F8@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3d80e6d42a2..7260f992cd11 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1282,3 +1283,11 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); + +void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); +} -- cgit From 284244a9876225eb73102aff41d4492f65cb2868 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:28 -0800 Subject: x86/mm/pkeys: Add Kconfig prompt to existing config option I don't have a strong opinion on whether we need this or not. Protection Keys has relatively little code associated with it, and it is not a heavyweight feature to keep enabled. However, I can imagine that folks would still appreciate being able to disable it. Here's the option if folks want it. 
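Whichever way the config question goes, the smaps hook above gives an easy way to see which key a mapping actually ended up with: each VMA entry in /proc/<pid>/smaps gains a "ProtectionKey:" field once OSPKE is set. A minimal user-space sketch, not part of the series, that just echoes those fields for the current process; on kernels without the hook it prints nothing.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/smaps", "r");
	char line[256];

	if (!f) {
		perror("/proc/self/smaps");
		return 1;
	}

	/* "ProtectionKey:" is the prefix printed by arch_show_smap() above */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "ProtectionKey:", 14))
			fputs(line, stdout);

	fclose(f);
	return 0;
}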
Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210228.7E79386C@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fb2ebeb9a692..b8754348de4d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1716,8 +1716,18 @@ config X86_INTEL_MPX If unsure, say N. config X86_INTEL_MEMORY_PROTECTION_KEYS + prompt "Intel Memory Protection Keys" def_bool y + # Note: only available in 64-bit mode depends on CPU_SUP_INTEL && X86_64 + ---help--- + Memory Protection Keys provides a mechanism for enforcing + page-based protections, but without requiring modification of the + page tables when an application changes protection domains. + + For details, see Documentation/x86/protection-keys.txt + + If unsure, say y. config EFI bool "EFI runtime service support" -- cgit From 0697694564c84f4c9320e5d103d0191297a20023 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:29 -0800 Subject: x86/mm/pkeys: Actually enable Memory Protection Keys in the CPU This sets the bit in 'cr4' to actually enable the protection keys feature. We also include a boot-time disable for the feature "nopku". Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE cpuid bit to appear set. At this point in boot, identify_cpu() has already run the actual CPUID instructions and populated the "cpu features" structures. We need to go back and re-run identify_cpu() to make sure it gets updated values. We *could* simply re-populate the 11th word of the cpuid data, but this is probably quick enough. Also note that with the cpu_has() check and X86_FEATURE_PKU present in disabled-features.h, we do not need an #ifdef for setup_pku(). Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210229.6708027C@viggo.jf.intel.com [ Small readability edits. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a719ad7551d2..4fac2634ba19 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -303,6 +303,48 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +/* + * Protection Keys are not available in 32-bit mode. + */ +static bool pku_disabled; + +static __always_inline void setup_pku(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_PKU)) + return; + if (pku_disabled) + return; + + cr4_set_bits(X86_CR4_PKE); + /* + * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * cpuid bit to be set. We need to ensure that we + * update that bit in this CPU's "cpu_info". + */ + get_cpu_cap(c); +} + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +static __init int setup_disable_pku(char *arg) +{ + /* + * Do not clear the X86_FEATURE_PKU bit. All of the + * runtime checks are against OSPKE so clearing the + * bit does nothing. 
+ * + * This way, we will see "pku" in cpuinfo, but not + * "ospke", which is exactly what we want. It shows + * that the CPU has PKU, but the OS has not enabled it. + * This happens to be exactly how a system would look + * if we disabled the config option. + */ + pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n"); + pku_disabled = true; + return 1; +} +__setup("nopku", setup_disable_pku); +#endif /* CONFIG_X86_64 */ + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -960,6 +1002,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) init_hypervisor(c); x86_init_rdrand(c); x86_init_cache_qos(c); + setup_pku(c); /* * Clear/Set all flags overriden by options, need do it -- cgit From 66d375709d2c891acc639538fd3179fa0cbb0daf Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:32 -0800 Subject: mm/core, x86/mm/pkeys: Add arch_validate_pkey() The syscall-level code is passed a protection key and need to return an appropriate error code if the protection key is bogus. We will be using this in subsequent patches. Note that this also begins a series of arch-specific calls that we need to expose in otherwise arch-independent code. We create a linux/pkeys.h header where we will put *all* the stubs for these functions. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210232.774EEAAB@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pkeys.h | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 arch/x86/include/asm/pkeys.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b8754348de4d..eda18cecdbbd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,6 +156,7 @@ config X86 select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS + select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h new file mode 100644 index 000000000000..04243c23380c --- /dev/null +++ b/arch/x86/include/asm/pkeys.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_PKEYS_H +#define _ASM_X86_PKEYS_H + +#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) + +#endif /*_ASM_X86_PKEYS_H */ -- cgit From 39a0526fb3f7d93433d146304278477eb463f8af Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:34 -0800 Subject: x86/mm: Factor out LDT init from context init The arch-specific mm_context_t is a great place to put protection-key allocation state. But, we need to initialize the allocation state because pkey 0 is always "allocated". All of the runtime initialization of mm_context_t is done in *_ldt() manipulation functions. This renames the existing LDT functions like this: init_new_context() -> init_new_context_ldt() destroy_context() -> destroy_context_ldt() and makes init_new_context() and destroy_context() available for generic use. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210234.DB34FCC5@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 21 ++++++++++++++++----- arch/x86/kernel/ldt.c | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6572b949cbca..84280029cafd 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -52,15 +52,15 @@ struct ldt_struct { /* * Used for LDT copy/destruction. */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); +int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context_ldt(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ -static inline int init_new_context(struct task_struct *tsk, - struct mm_struct *mm) +static inline int init_new_context_ldt(struct task_struct *tsk, + struct mm_struct *mm) { return 0; } -static inline void destroy_context(struct mm_struct *mm) {} +static inline void destroy_context_ldt(struct mm_struct *mm) {} #endif static inline void load_mm_ldt(struct mm_struct *mm) @@ -104,6 +104,17 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) #endif } +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + init_new_context_ldt(tsk, mm); + return 0; +} +static inline void destroy_context(struct mm_struct *mm) +{ + destroy_context_ldt(mm); +} + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6acc9dd91f36..6707039b9032 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -103,7 +103,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) * we do not have to muck with descriptors here, that is * done in switch_mm() as needed. */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) { struct ldt_struct *new_ldt; struct mm_struct *old_mm; @@ -144,7 +144,7 @@ out_unlock: * * 64bit: Don't touch the LDT register - we're already in the next thread. */ -void destroy_context(struct mm_struct *mm) +void destroy_context_ldt(struct mm_struct *mm) { free_ldt_struct(mm->context.ldt); mm->context.ldt = NULL; -- cgit From b8b9b6ba9dec3f155c7555cb208ba4078e97aedb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:35 -0800 Subject: x86/fpu: Allow setting of XSAVE state We want to modify the Protection Key rights inside the kernel, so we need to change PKRU's contents. But, if we do a plain 'wrpkru', when we return to userspace we might do an XRSTOR and wipe out the kernel's 'wrpkru'. So, we need to go after PKRU in the xsave buffer. We do this by: 1. Ensuring that we have the XSAVE registers (fpregs) in the kernel FPU buffer (fpstate) 2. Looking up the location of a given state in the buffer 3. Filling in the stat 4. Ensuring that the hardware knows that state is present there (basically that the 'init optimization' is not in place). 5. Copying the newly-modified state back to the registers if necessary. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210235.5A3139BF@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 + arch/x86/kernel/fpu/core.c | 63 ++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.c | 98 ++++++++++++++++++++++++++++++++++++- 3 files changed, 161 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a2124343edf5..31ac8e6d9f36 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -25,6 +25,8 @@ extern void fpu__activate_curr(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); +extern void fpu__current_fpstate_write_begin(void); +extern void fpu__current_fpstate_write_end(void); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 299b58bb975b..dea8e76d60c6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -353,6 +353,69 @@ void fpu__activate_fpstate_write(struct fpu *fpu) } } +/* + * This function must be called before we write the current + * task's fpstate. + * + * This call gets the current FPU register state and moves + * it in to the 'fpstate'. Preemption is disabled so that + * no writes to the 'fpstate' can occur from context + * swiches. + * + * Must be followed by a fpu__current_fpstate_write_end(). + */ +void fpu__current_fpstate_write_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + /* + * Ensure that the context-switching code does not write + * over the fpstate while we are doing our update. + */ + preempt_disable(); + + /* + * Move the fpregs in to the fpu's 'fpstate'. + */ + fpu__activate_fpstate_read(fpu); + + /* + * The caller is about to write to 'fpu'. Ensure that no + * CPU thinks that its fpregs match the fpstate. This + * ensures we will not be lazy and skip a XRSTOR in the + * future. + */ + fpu->last_cpu = -1; +} + +/* + * This function must be paired with fpu__current_fpstate_write_begin() + * + * This will ensure that the modified fpstate gets placed back in + * the fpregs if necessary. + * + * Note: This function may be called whether or not an _actual_ + * write to the fpstate occurred. + */ +void fpu__current_fpstate_write_end(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + /* + * 'fpu' now has an updated copy of the state, but the + * registers may still be out of date. Update them with + * an XRSTOR if they are active. + */ + if (fpregs_active()) + copy_kernel_to_fpregs(&fpu->state); + + /* + * Our update is done and the fpregs/fpstate are in sync + * if necessary. Context switches can happen again. + */ + preempt_enable(); +} + /* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a63ca80fa723..30d144f01eb9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -678,6 +678,19 @@ void fpu__resume_cpu(void) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } +/* + * Given an xstate feature mask, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. 
+ * + * Note: does not work for compacted buffers. + */ +void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +{ + int feature_nr = fls64(xstate_feature_mask) - 1; + + return (void *)xsave + xstate_comp_offsets[feature_nr]; +} /* * Given the xsave area and a state inside, this function returns the * address of the state. @@ -698,7 +711,6 @@ void fpu__resume_cpu(void) */ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) { - int feature_nr = fls64(xstate_feature) - 1; /* * Do we even *have* xsave state? */ @@ -726,7 +738,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!(xsave->header.xfeatures & xstate_feature)) return NULL; - return (void *)xsave + xstate_comp_offsets[feature_nr]; + return __raw_xsave_addr(xsave, xstate_feature); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -761,3 +773,85 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } + + +/* + * Set xfeatures (aka XSTATE_BV) bit for a feature that we want + * to take out of its "init state". This will ensure that an + * XRSTOR actually restores the state. + */ +static void fpu__xfeature_set_non_init(struct xregs_state *xsave, + int xstate_feature_mask) +{ + xsave->header.xfeatures |= xstate_feature_mask; +} + +/* + * This function is safe to call whether the FPU is in use or not. + * + * Note that this only works on the current task. + * + * Inputs: + * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, + * XFEATURE_MASK_SSE, etc...) + * @xsave_state_ptr: a pointer to a copy of the state that you would + * like written in to the current task's FPU xsave state. This pointer + * must not be located in the current tasks's xsave area. + * Output: + * address of the state in the xsave area or NULL if the state + * is not present or is in its 'init state'. + */ +static void fpu__xfeature_set_state(int xstate_feature_mask, + void *xstate_feature_src, size_t len) +{ + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct fpu *fpu = ¤t->thread.fpu; + void *dst; + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { + WARN_ONCE(1, "%s() attempted with no xsave support", __func__); + return; + } + + /* + * Tell the FPU code that we need the FPU state to be in + * 'fpu' (not in the registers), and that we need it to + * be stable while we write to it. + */ + fpu__current_fpstate_write_begin(); + + /* + * This method *WILL* *NOT* work for compact-format + * buffers. If the 'xstate_feature_mask' is unset in + * xcomp_bv then we may need to move other feature state + * "up" in the buffer. + */ + if (xsave->header.xcomp_bv & xstate_feature_mask) { + WARN_ON_ONCE(1); + goto out; + } + + /* find the location in the xsave buffer of the desired state */ + dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); + + /* + * Make sure that the pointer being passed in did not + * come from the xsave buffer itself. + */ + WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); + + /* put the caller-provided data in the location */ + memcpy(dst, xstate_feature_src, len); + + /* + * Mark the xfeature so that the CPU knows there is state + * in the buffer now. + */ + fpu__xfeature_set_non_init(xsave, xstate_feature_mask); +out: + /* + * We are done writing to the 'fpu'. Reenable preeption + * and (possibly) move the fpstate back in to the fpregs. 
+ */ + fpu__current_fpstate_write_end(); +} -- cgit From 8459429693395ca9e8d18101300b120ad9171795 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:36 -0800 Subject: x86/mm/pkeys: Allow kernel to modify user pkey rights register The Protection Key Rights for User memory (PKRU) is a 32-bit user-accessible register. It contains two bits for each protection key: one to write-disable (WD) access to memory covered by the key and another to access-disable (AD). Userspace can read/write the register with the RDPKRU and WRPKRU instructions. But, the register is saved and restored with the XSAVE family of instructions, which means we have to treat it like a floating point register. The kernel needs to write to the register if it wants to implement execute-only memory or if it implements a system call to change PKRU. To do this, we need to create a 'pkru_state' buffer, read the old contents in to it, modify it, and then tell the FPU code that there is modified data in there so it can (possibly) move the buffer back in to the registers. This uses the fpu__xfeature_set_state() function that we defined in the previous patch. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210236.0BE13217@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 5 +-- arch/x86/include/asm/pkeys.h | 3 ++ arch/x86/kernel/fpu/xstate.c | 74 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cbfae80abb2..1ff49ec29ece 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -921,16 +921,17 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) #define PKRU_AD_BIT 0x1 #define PKRU_WD_BIT 0x2 +#define PKRU_BITS_PER_PKEY 2 static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { - int pkru_pkey_bits = pkey * 2; + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); } static inline bool __pkru_allows_write(u32 pkru, u16 pkey) { - int pkru_pkey_bits = pkey * 2; + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; /* * Access-disable disables writes too so we need to check * both bits here. diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 04243c23380c..5061aec2ed5e 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -3,4 +3,7 @@ #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) +extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + #endif /*_ASM_X86_PKEYS_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 30d144f01eb9..50813c35e9d9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include @@ -855,3 +856,76 @@ out: */ fpu__current_fpstate_write_end(); } + +#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) +#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) + +/* + * This will go out and modify the XSAVE buffer so that PKRU is + * set to a particular state for access to 'pkey'. + * + * PKRU state does affect kernel access to user memory. 
We do + * not modfiy PKRU *itself* here, only the XSAVE state that will + * be restored in to PKRU when we return back to userspace. + */ +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct pkru_state *old_pkru_state; + struct pkru_state new_pkru_state; + int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); + u32 new_pkru_bits = 0; + + if (!validate_pkey(pkey)) + return -EINVAL; + /* + * This check implies XSAVE support. OSPKE only gets + * set if we enable XSAVE and we enable PKU in XCR0. + */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return -EINVAL; + + /* Set the bits we need in PKRU */ + if (init_val & PKEY_DISABLE_ACCESS) + new_pkru_bits |= PKRU_AD_BIT; + if (init_val & PKEY_DISABLE_WRITE) + new_pkru_bits |= PKRU_WD_BIT; + + /* Shift the bits in to the correct place in PKRU for pkey. */ + new_pkru_bits <<= pkey_shift; + + /* Locate old copy of the state in the xsave buffer */ + old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); + + /* + * When state is not in the buffer, it is in the init + * state, set it manually. Otherwise, copy out the old + * state. + */ + if (!old_pkru_state) + new_pkru_state.pkru = 0; + else + new_pkru_state.pkru = old_pkru_state->pkru; + + /* mask off any old bits in place */ + new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); + /* Set the newly-requested bits */ + new_pkru_state.pkru |= new_pkru_bits; + + /* + * We could theoretically live without zeroing pkru.pad. + * The current XSAVE feature state definition says that + * only bytes 0->3 are used. But we do not want to + * chance leaking kernel stack out to userspace in case a + * memcpy() of the whole xsave buffer was done. + * + * They're in the same cacheline anyway. + */ + new_pkru_state.pad = 0; + + fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, + sizeof(new_pkru_state)); + + return 0; +} -- cgit From 878ba03932d757ce4e954db4defec74a0de0435b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:37 -0800 Subject: x86/mm/pkeys: Create an x86 arch_calc_vm_prot_bits() for VMA flags calc_vm_prot_bits() takes PROT_{READ,WRITE,EXECUTE} bits and turns them in to the vma->vm_flags/VM_* bits. We need to do a similar thing for protection keys. We take a protection key (4 bits) and encode it in to the 4 VM_PKEY_* bits. Note: this code is not new. It was simply a part of the mprotect_pkey() patch in the past. I broke it out for use in the execute-only support. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210237.CFB94AD5@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/mman.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index e8562e0a2993..39bca7fac087 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -20,6 +20,12 @@ ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \ ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \ ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0)) + +#define arch_calc_vm_prot_bits(prot, key) ( \ + ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ + ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ + ((key) & 0x4 ? 
VM_PKEY_BIT2 : 0) | \ + ((key) & 0x8 ? VM_PKEY_BIT3 : 0)) #endif #include -- cgit From 62b5f7d013fc455b8db26cf01e421f4c0d264b92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:40 -0800 Subject: mm/core, x86/mm/pkeys: Add execute-only protection keys support Protection keys provide new page-based protection in hardware. But, they have an interesting attribute: they only affect data accesses and never affect instruction fetches. That means that if we set up some memory which is set as "access-disabled" via protection keys, we can still execute from it. This patch uses protection keys to set up mappings to do just that. If a user calls: mmap(..., PROT_EXEC); or mprotect(ptr, sz, PROT_EXEC); (note PROT_EXEC-only without PROT_READ/WRITE), the kernel will notice this, and set a special protection key on the memory. It also sets the appropriate bits in the Protection Keys User Rights (PKRU) register so that the memory becomes unreadable and unwritable. I haven't found any userspace that does this today. With this facility in place, we expect userspace to move to use it eventually. Userspace _could_ start doing this today. Any PROT_EXEC calls get converted to PROT_READ inside the kernel, and would transparently be upgraded to "true" PROT_EXEC with this code. IOW, userspace never has to do any PROT_EXEC runtime detection. This feature provides enhanced protection against leaking executable memory contents. This helps thwart attacks which are attempting to find ROP gadgets on the fly. But, the security provided by this approach is not comprehensive. The PKRU register which controls access permissions is a normal user register writable from unprivileged userspace. An attacker who can execute the 'wrpkru' instruction can easily disable the protection provided by this feature. The protection key that is used for execute-only support is permanently dedicated at compile time. This is fine for now because there is currently no API to set a protection key other than this one. Despite there being a constant PKRU value across the entire system, we do not set it unless this feature is in use in a process. That is to preserve the PKRU XSAVE 'init state', which can lead to faster context switches. PKRU *is* a user register and the kernel is modifying it. That means that code doing: pkru = rdpkru() pkru |= 0x100; mmap(..., PROT_EXEC); wrpkru(pkru); could lose the bits in PKRU that enforce execute-only permissions. To avoid this, we suggest avoiding ever calling mmap() or mprotect() when the PKRU value is expected to be unstable. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Gang Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Piotr Kwapulinski Cc: Rik van Riel Cc: Stephen Smalley Cc: Vladimir Murzin Cc: Will Deacon Cc: keescook@google.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210240.CB4BB5CA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pkeys.h | 25 +++++++++++ arch/x86/kernel/fpu/xstate.c | 2 - arch/x86/mm/Makefile | 2 + arch/x86/mm/fault.c | 10 +++++ arch/x86/mm/pkeys.c | 101 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 arch/x86/mm/pkeys.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 5061aec2ed5e..7b84565c916c 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -6,4 +6,29 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); +/* + * Try to dedicate one of the protection keys to be used as an + * execute-only protection key. + */ +#define PKEY_DEDICATED_EXECUTE_ONLY 15 +extern int __execute_only_pkey(struct mm_struct *mm); +static inline int execute_only_pkey(struct mm_struct *mm) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return 0; + + return __execute_only_pkey(mm); +} + +extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey); +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return 0; + + return __arch_override_mprotect_pkey(vma, prot, pkey); +} + #endif /*_ASM_X86_PKEYS_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 50813c35e9d9..1b1981812bb6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -877,8 +877,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); u32 new_pkru_bits = 0; - if (!validate_pkey(pkey)) - return -EINVAL; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f9d38a48e3c8..67cf2e1e557b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o + diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d81744e6f39f..5877b92ab6f1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1108,6 +1108,16 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) */ if (error_code & PF_PK) return 1; + + if (!(error_code & PF_INSTR)) { + /* + * Assume all accesses require either read or execute + * permissions. This is not an instruction access, so + * it requires read permissions. + */ + if (!(vma->vm_flags & VM_READ)) + return 1; + } /* * Make sure to check the VMA so that we do not perform * faults just to hit a PF_PK as soon as we fill in a diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c new file mode 100644 index 000000000000..e8c474451928 --- /dev/null +++ b/arch/x86/mm/pkeys.c @@ -0,0 +1,101 @@ +/* + * Intel Memory Protection Keys management + * Copyright (c) 2015, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include /* mm_struct, vma, etc... */ +#include /* PKEY_* */ +#include + +#include /* boot_cpu_has, ... */ +#include /* vma_pkey() */ +#include /* fpregs_active() */ + +int __execute_only_pkey(struct mm_struct *mm) +{ + int ret; + + /* + * We do not want to go through the relatively costly + * dance to set PKRU if we do not need to. Check it + * first and assume that if the execute-only pkey is + * write-disabled that we do not have to set it + * ourselves. We need preempt off so that nobody + * can make fpregs inactive. + */ + preempt_disable(); + if (fpregs_active() && + !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { + preempt_enable(); + return PKEY_DEDICATED_EXECUTE_ONLY; + } + preempt_enable(); + ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, + PKEY_DISABLE_ACCESS); + /* + * If the PKRU-set operation failed somehow, just return + * 0 and effectively disable execute-only support. + */ + if (ret) + return 0; + + return PKEY_DEDICATED_EXECUTE_ONLY; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) + return false; + + return true; +} + +/* + * This is only called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) +{ + /* + * Is this an mprotect_pkey() call? If so, never + * override the value that came from the user. + */ + if (pkey != -1) + return pkey; + /* + * Look for a protection-key-drive execute-only mapping + * which is now being given permissions that are not + * execute-only. Move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && + (prot & (PROT_READ|PROT_WRITE))) { + return 0; + } + /* + * The mapping is execute-only. Go try to get the + * execute-only protection key. If we fail to do that, + * fall through as if we do not have execute-only + * support. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + /* + * This is a vanilla, non-pkey mprotect (or we failed to + * setup execute-only), inherit the pkey from the VMA we + * are working on. + */ + return vma_pkey(vma); +} -- cgit From 568b329a02f75ed3aaae5eb2cca384cb9e09cb29 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:57 -0800 Subject: perf: generalize perf_callchain . avoid walking the stack when there is no room left in the buffer . generalize get_perf_callchain() to be called from bpf helper Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- arch/x86/include/asm/stacktrace.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 4 ++-- arch/x86/kernel/dumpstack.c | 6 ++++-- arch/x86/kernel/stacktrace.c | 18 +++++++++++------- arch/x86/oprofile/backtrace.c | 3 ++- 5 files changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 70bbe39043a9..7c247e7404be 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*address)(void *data, unsigned long address, int reliable); + int (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); walk_stack_t walk_stack; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1b443db2db50..d276b31ca473 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2180,11 +2180,11 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - perf_callchain_store(entry, addr); + return perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acfadae2..0d1ff4b407d4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo, if (!__kernel_text_address(addr)) break; - ops->address(data, addr, 1); + if (ops->address(data, addr, 1)) + break; frame = frame->next_frame; ret_addr = &frame->return_address; print_ftrace_graph_addr(addr, data, ops, tinfo, graph); @@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. 
*/ -static void print_trace_address(void *data, unsigned long addr, int reliable) +static int print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); + return 0; } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c6430e5a..9ee98eefc44d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void +static int __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) - return; + return 0; #endif if (nosched && in_sched_functions(addr)) - return; + return 0; if (trace->skip > 0) { trace->skip--; - return; + return 0; } - if (trace->nr_entries < trace->max_entries) + if (trace->nr_entries < trace->max_entries) { trace->entries[trace->nr_entries++] = addr; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static int save_stack_address(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, false); } -static void +static int save_stack_address_nosched(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, true); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 4e664bdb535a..cb31a4440e58 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { unsigned int *depth = data; if ((*depth)--) oprofile_add_trace(addr); + return 0; } static struct stacktrace_ops backtrace_ops = { -- cgit From b176862fca8625b0a8bee207bca9b611413e5e24 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Feb 2016 21:00:41 +0100 Subject: x86/mm/ptdump: Remove paravirt_enabled() is_hypervisor_range() can simply check if the PGD index is within ffff800000000000 - ffff87ffffffffff which is the range reserved for a hypervisor. That range is practically an ABI, see Documentation/x86/x86_64/mm.txt. Tested-by: Boris Ostrovsky # Under Xen, as PV guest Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455825641-19585-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 4a6f1d9b5106..99bfb192803f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -#ifdef CONFIG_X86_64 static inline bool is_hypervisor_range(int idx) { +#ifdef CONFIG_X86_64 /* * ffff800000000000 - ffff87ffffffffff is reserved for * the hypervisor. 
*/ - return paravirt_enabled() && - (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); -} + return (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); #else -static inline bool is_hypervisor_range(int idx) { return false; } + return false; #endif +} static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) -- cgit From c25323c07345a843a56a294047b130dfd9250fad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Feb 2016 20:53:43 +0100 Subject: x86/tsc: Use topology functions It's simpler to look at the topology mask than iterating over all online cpus to find a cpu on the same package. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- arch/x86/kernel/tsc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..5a6cb4684e0f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1246,14 +1246,14 @@ void __init tsc_init(void) */ unsigned long calibrate_delay_is_known(void) { - int i, cpu = smp_processor_id(); + int sibling, cpu = smp_processor_id(); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - for_each_online_cpu(i) - if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) - return cpu_data(i).loops_per_jiffy; + sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; return 0; } #endif -- cgit From 397630150632639b3ca5b4414accd5011c45e276 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 17 Feb 2016 12:35:56 +0000 Subject: x86/mm/pat: Use _PAGE_GLOBAL bit for EFI page table mappings Since EFI page tables can be treated as kernel page tables they should be global. All the other page mapping functions in pageattr.c set the _PAGE_GLOBAL bit and we want to avoid inconsistencies when we map a page in the EFI code paths, for example when that page is split in __split_large_page(), etc. It also makes it easier to validate that the EFI region mappings have the correct attributes because there are fewer differences compared with regular kernel mappings. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Matt Fleming Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1455712566-16727-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 632d34d20237..bf312da41a6d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -909,6 +909,20 @@ static void populate_pte(struct cpa_data *cpa, pte = pte_offset_kernel(pmd, start); + /* + * Set the GLOBAL flags only if the PRESENT flag is + * set otherwise pte_present will return true even on + * a non present pte. The canon_pgprot will clear + * _PAGE_GLOBAL for the ancient hardware that doesn't + * support it. 
+ */ + if (pgprot_val(pgprot) & _PAGE_PRESENT) + pgprot_val(pgprot) |= _PAGE_GLOBAL; + else + pgprot_val(pgprot) &= ~_PAGE_GLOBAL; + + pgprot = canon_pgprot(pgprot); + while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); -- cgit From 15f003d20782a4079e078d16df57081ebd1fc150 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 17 Feb 2016 12:36:04 +0000 Subject: x86/mm/pat: Don't implicitly allow _PAGE_RW in kernel_map_pages_in_pgd() As part of the preparation for the EFI_MEMORY_RO flag added in the UEFI 2.5 specification, we need the ability to map pages in kernel page tables without _PAGE_RW being set. Modify kernel_map_pages_in_pgd() to require its callers to pass _PAGE_RW if the pages need to be mapped read/write. Otherwise, we'll map the pages as read-only. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Matt Fleming Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1455712566-16727-12-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 +++ arch/x86/platform/efi/efi_64.c | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bf312da41a6d..14c38ae80409 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1971,6 +1971,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_NX)) cpa.mask_clr = __pgprot(_PAGE_NX); + if (!(page_flags & _PAGE_RW)) + cpa.mask_clr = __pgprot(_PAGE_RW); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index b492521503fe..b0965b27e47f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -233,7 +233,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * phys_efi_set_virtual_address_map(). 
*/ pfn = pa_memmap >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX)) { + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); return 1; } @@ -262,7 +262,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) pfn = md->phys_addr >> PAGE_SHIFT; npages = md->num_pages; - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, 0)) { + if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) { pr_err("Failed to map 1:1 memory\n"); return 1; } @@ -279,7 +279,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) text = __pa(_text); pfn = text >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, 0)) { + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { pr_err("Failed to map kernel text 1:1\n"); return 1; } @@ -294,7 +294,7 @@ void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) static void __init __map_region(efi_memory_desc_t *md, u64 va) { - unsigned long flags = 0; + unsigned long flags = _PAGE_RW; unsigned long pfn; pgd_t *pgd = efi_pgd; -- cgit From 6d0cc887d571e96f928be83f094322451fd4bf6f Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 17 Feb 2016 12:36:05 +0000 Subject: x86/efi: Map EFI_MEMORY_{XP,RO} memory region bits to EFI page tables Now that we have EFI memory region bits that indicate which regions do not need execute permission or read/write permission in the page tables, let's use them. We also check for EFI_NX_PE_DATA and only enforce the restrictive mappings if it's present (to allow us to ignore buggy firmware that sets bits it didn't mean to and to preserve backwards compatibility). Instead of assuming that firmware would set appropriate attributes in memory descriptor like EFI_MEMORY_RO for code and EFI_MEMORY_XP for data, we can expect some firmware out there which might only set *type* in memory descriptor to be EFI_RUNTIME_SERVICES_CODE or EFI_RUNTIME_SERVICES_DATA leaving away attribute. This will lead to improper mappings of EFI runtime regions. In order to avoid it, we check attribute and type of memory descriptor to update mappings and moreover Windows works this way. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Matt Fleming Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1455712566-16727-13-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 2 +- arch/x86/platform/efi/efi.c | 9 +++++++-- arch/x86/platform/efi/efi_32.c | 2 +- arch/x86/platform/efi/efi_64.c | 45 ++++++++++++++++++++++++++++++++++++++---- 4 files changed, 50 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8fd9e637629a..7bb206f73915 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -141,7 +141,7 @@ extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pa extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); -extern void __init efi_runtime_mkexec(void); +extern void __init efi_runtime_update_mappings(void); extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index e80826e6f3a9..994a7df84a7b 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -934,7 +934,6 @@ static void __init __efi_enter_virtual_mode(void) } efi_sync_low_kernel_mappings(); - efi_dump_pagetable(); if (efi_is_native()) { status = phys_efi_set_virtual_address_map( @@ -972,7 +971,13 @@ static void __init __efi_enter_virtual_mode(void) efi.set_virtual_address_map = NULL; - efi_runtime_mkexec(); + /* + * Apply more restrictive page table mapping attributes now that + * SVAM() has been called and the firmware has performed all + * necessary relocation fixups for the new virtual addresses. 
+ */ + efi_runtime_update_mappings(); + efi_dump_pagetable(); /* * We mapped the descriptor array into the EFI pagetable above diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 58d669bc8250..338402b91d2e 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -90,7 +90,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) __flush_tlb_all(); } -void __init efi_runtime_mkexec(void) +void __init efi_runtime_update_mappings(void) { if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index b0965b27e47f..40d2f447a9dd 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -393,13 +393,50 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) efi_setup = phys_addr + sizeof(struct setup_data); } -void __init efi_runtime_mkexec(void) +void __init efi_runtime_update_mappings(void) { - if (!efi_enabled(EFI_OLD_MEMMAP)) + unsigned long pfn; + pgd_t *pgd = efi_pgd; + efi_memory_desc_t *md; + void *p; + + if (efi_enabled(EFI_OLD_MEMMAP)) { + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); + return; + } + + if (!efi_enabled(EFI_NX_PE_DATA)) return; - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + unsigned long pf = 0; + md = p; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + + if (!(md->attribute & EFI_MEMORY_WB)) + pf |= _PAGE_PCD; + + if ((md->attribute & EFI_MEMORY_XP) || + (md->type == EFI_RUNTIME_SERVICES_DATA)) + pf |= _PAGE_NX; + + if (!(md->attribute & EFI_MEMORY_RO) && + (md->type != EFI_RUNTIME_SERVICES_CODE)) + pf |= _PAGE_RW; + + /* Update the 1:1 mapping */ + pfn = md->phys_addr >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf)) + pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + + if (kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf)) + pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, md->virt_addr); + } } void __init efi_dump_pagetable(void) -- cgit From 2ad510dc372c2caac9aada9ff6dd10e787616e1d Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 17 Feb 2016 12:36:06 +0000 Subject: x86/efi: Only map kernel text for EFI mixed mode The correct symbol to use when figuring out the size of the kernel text is '_etext', not '_end' which is the symbol for the entire kernel image includes data and debug sections. 
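For reference, a minimal sketch of the arithmetic in question (assuming the usual <asm/sections.h>-style symbol declarations and x86's PAGE_SHIFT; the helper name is made up for illustration):

extern char _text[], _etext[], _end[];  /* linker-provided section markers */

/*
 * Sketch only: pages spanned by the kernel text. _etext marks the end of
 * .text, while _end also covers data, BSS and debug sections, which is
 * why the hunk below switches the mixed mode text mapping to _etext.
 */
static unsigned long kernel_text_pages(void)
{
        return (_etext - _text) >> PAGE_SHIFT;
}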
Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Matt Fleming Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Ricardo Neri Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1455712566-16727-14-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 40d2f447a9dd..49e4dd4a1f58 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -275,7 +275,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.phys_stack = virt_to_phys(page_address(page)); efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */ - npages = (_end - _text) >> PAGE_SHIFT; + npages = (_etext - _text) >> PAGE_SHIFT; text = __pa(_text); pfn = text >> PAGE_SHIFT; -- cgit From e267d97b83d9cecc16c54825f9f3ac7f72dc1e1e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:12 -0800 Subject: asm-generic: Consolidate mark_rodata_ro() Instead of defining mark_rodata_ro() in each architecture, consolidate it. Signed-off-by: Kees Cook Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Gross Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Ashok Kumar Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Dan Williams Cc: David Brown Cc: David Hildenbrand Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Marc Zyngier Cc: Mark Rutland Cc: Mathias Krause Cc: Michael Ellerman Cc: Nicolas Pitre Cc: PaX Team Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Russell King Cc: Rusty Russell Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cacheflush.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e63aa38e85fb..c8cff75c5b21 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -92,7 +92,6 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); -- cgit From 9ccaf77cf05915f51231d158abfd5448aedde758 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:14 -0800 Subject: x86/mm: Always enable CONFIG_DEBUG_RODATA and remove the Kconfig option This removes the CONFIG_DEBUG_RODATA option and makes it always enabled. This simplifies the code and also makes it clearer that read-only mapped memory is just as fundamental a security feature in kernel-space as it is in user-space. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 +++ arch/x86/Kconfig.debug | 18 +++--------------- arch/x86/include/asm/cacheflush.h | 5 ----- arch/x86/include/asm/kvm_para.h | 7 ------- arch/x86/include/asm/sections.h | 2 +- arch/x86/kernel/ftrace.c | 6 +++--- arch/x86/kernel/kgdb.c | 8 ++------ arch/x86/kernel/test_nx.c | 2 -- arch/x86/kernel/test_rodata.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 25 +++++++++++-------------- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- arch/x86/mm/pageattr.c | 2 +- 13 files changed, 25 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c46662f64c39..b1051057e5b0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -303,6 +303,9 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config DEBUG_RODATA + def_bool y + config PGTABLE_LEVELS int default 4 if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9b18ed97a8a2..7816b7b276f4 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -74,28 +74,16 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y - depends on DEBUG_KERNEL - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const - data. This is recommended so that we can catch kernel bugs sooner. - If in doubt, say "Y". - config DEBUG_RODATA_TEST - bool "Testcase for the DEBUG_RODATA feature" - depends on DEBUG_RODATA + bool "Testcase for the marking rodata read-only" default y ---help--- - This option enables a testcase for the DEBUG_RODATA - feature as well as for the change_page_attr() infrastructure. + This option enables a testcase for the setting rodata read-only + as well as for the change_page_attr() infrastructure. If in doubt, say "N" config DEBUG_WX bool "Warn on W+X mappings at boot" - depends on DEBUG_RODATA select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index c8cff75c5b21..61518cf79437 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -91,15 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -#ifdef CONFIG_DEBUG_RODATA extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif #ifdef CONFIG_DEBUG_RODATA_TEST int rodata_test(void); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c1adf33fdd0d..bc62e7cbf1b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -#ifdef CONFIG_DEBUG_RODATA #define KVM_HYPERCALL \ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) -#else -/* On AMD processors, vmcall will generate a trap that we will - * then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 0a5242428659..13b6cdd0af57 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,7 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6d6626..05c9e3f5b6d7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. * * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. 
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..ed15cd486d06 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce07e525..27538f183c3b 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5099da..cb4a01b41e27 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven "); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..fe133b710bef 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. 
+ * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -112,13 +111,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..2ebfbaf61142 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -960,5 +959,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5488d21123bd..a40b755c67e3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1074,7 +1074,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1166,8 +1165,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814b0069..2450488f39ef 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -283,7 +283,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections -- cgit From 018ef8dcf3de5f62e2cc1a9273cc27e1c6ba8de5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:17 -0800 Subject: x86/vdso: Mark the vDSO code read-only after init The vDSO does not need to be writable after __init, so mark it as __ro_after_init. 
The result kills the exploit method of writing to the vDSO from kernel space resulting in userspace executing the modified code, as shown here to bypass SMEP restrictions: http://itszn.com/blog/?p=21 The memory map (with added vDSO address reporting) shows the vDSO moving into read-only memory: Before: [ 0.143067] vDSO @ ffffffff82004000 [ 0.143551] vDSO @ ffffffff82006000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e05000 20K ro GLB NX pte 0xffffffff81e05000-0xffffffff82000000 2028K ro NX pte 0xffffffff82000000-0xffffffff8214f000 1340K RW GLB NX pte 0xffffffff8214f000-0xffffffff82281000 1224K RW NX pte 0xffffffff82281000-0xffffffff82400000 1532K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd After: [ 0.145062] vDSO @ ffffffff81da1000 [ 0.146057] vDSO @ ffffffff81da4000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e0b000 44K ro GLB NX pte 0xffffffff81e0b000-0xffffffff82000000 2004K ro NX pte 0xffffffff82000000-0xffffffff8214c000 1328K RW GLB NX pte 0xffffffff8214c000-0xffffffff8227e000 1224K RW NX pte 0xffffffff8227e000-0xffffffff82400000 1544K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..3f69326ed545 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "#include \n"); fprintf(outfile, "\n"); fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) -- cgit From dd71a17b1193dd4a4c35ecd0ba227aac3d110836 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 23 Feb 2016 01:29:58 +0000 Subject: x86/platform/intel/quark: Change the kernel's IMR lock bit to false Currently when setting up an IMR around the kernel's .text section we lock that IMR, preventing further modification. 
While superficially this appears to be the right thing to do, in fact this doesn't account for a legitimate change in the memory map such as when executing a new kernel via kexec. In such a scenario a second kernel can have a different size and location to it's predecessor and can view some of the memory occupied by it's predecessor as legitimately usable DMA RAM. If this RAM were then subsequently allocated to DMA agents within the system it could conceivably trigger an IMR violation. This patch fixes the this potential situation by keeping the kernel's .text section IMR lock bit false by default. Suggested-by: Ingo Molnar Reported-by: Andy Shevchenko Signed-off-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boon.leong.ong@intel.com Cc: paul.gortmaker@windriver.com Link: http://lkml.kernel.org/r/1456190999-12685-2-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index c61b6c332e97..bfadcd0f4944 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -592,14 +592,14 @@ static void __init imr_fixup_memmap(struct imr_device *idev) end = (unsigned long)__end_rodata - 1; /* - * Setup a locked IMR around the physical extent of the kernel + * Setup an unlocked IMR around the physical extent of the kernel * from the beginning of the .text secton to the end of the * .rodata section as one physically contiguous block. * * We don't round up @size since it is already PAGE_SIZE aligned. * See vmlinux.lds.S for details. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); if (ret < 0) { pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", size / 1024, start, end); -- cgit From c637fa5294cefeda8be73cce20ba6693d22262dc Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 23 Feb 2016 01:29:59 +0000 Subject: x86/platform/intel/quark: Drop IMR lock bit support Isolated Memory Regions support a lock bit. The lock bit in an IMR prevents modification of the IMR until the core goes through a warm or cold reset. The lock bit feature is not useful in the context of the kernel API and is not really necessary since modification of IMRs is possible only from ring-zero anyway. This patch drops support for IMR locks bits, it simplifies the kernel API and removes an unnecessary and needlessly complex feature. 
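A before/after sketch of a caller under the simplified API (the region base and size are made-up values, and <asm/imr.h> is assumed to be available):

/* Sketch: carve out a hypothetical CPU-only region with the new API. */
static int __init example_imr_setup(void)
{
        phys_addr_t base = 0x40000000;  /* example address, 1 KiB aligned */
        size_t size = 4096;             /* example size, 1 KiB aligned    */
        int ret;

        /* Old API: imr_add_range(base, size, IMR_CPU, IMR_CPU, false); */
        ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
        if (ret < 0)
                pr_err("imr: failed to add example range (%d)\n", ret);

        return ret;
}

The lock parameter simply disappears; callers that previously passed false need no behavioural change.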
Suggested-by: Ingo Molnar Signed-off-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Cc: boon.leong.ong@intel.com Cc: paul.gortmaker@windriver.com Link: http://lkml.kernel.org/r/1456190999-12685-3-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/imr.h | 2 +- arch/x86/platform/intel-quark/imr.c | 24 ++++++------------------ arch/x86/platform/intel-quark/imr_selftest.c | 15 +++++++-------- 3 files changed, 14 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h index cd2ce4068441..ebea2c9d2cdc 100644 --- a/arch/x86/include/asm/imr.h +++ b/arch/x86/include/asm/imr.h @@ -53,7 +53,7 @@ #define IMR_MASK (IMR_ALIGN - 1) int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock); + unsigned int rmask, unsigned int wmask); int imr_remove_range(phys_addr_t base, size_t size); diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 740445a53363..17d6d2296e4d 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -134,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) * @idev: pointer to imr_device structure. * @imr_id: IMR entry to write. * @imr: IMR structure representing address and access masks. - * @lock: indicates if the IMR lock bit should be applied. * @return: 0 on success or error code passed from mbi_iosf on failure. */ -static int imr_write(struct imr_device *idev, u32 imr_id, - struct imr_regs *imr, bool lock) +static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) { unsigned long flags; u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; @@ -162,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id, if (ret) goto failed; - /* Lock bit must be set separately to addr_lo address bits. */ - if (lock) { - imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); - if (ret) - goto failed; - } - local_irq_restore(flags); return 0; failed: @@ -322,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) * @size: physical size of region in bytes must be aligned to 1KiB. * @read_mask: read access mask. * @write_mask: write access mask. - * @lock: indicates whether or not to permanently lock this region. * @return: zero on success or negative value indicating error. 
*/ int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock) + unsigned int rmask, unsigned int wmask) { phys_addr_t end; unsigned int i; @@ -399,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.rmask = rmask; imr.wmask = wmask; - ret = imr_write(idev, reg, &imr, lock); + ret = imr_write(idev, reg, &imr); if (ret < 0) { /* * In the highly unlikely event iosf_mbi_write failed @@ -410,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.addr_hi = 0; imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - imr_write(idev, reg, &imr, false); + imr_write(idev, reg, &imr); } failed: mutex_unlock(&idev->lock); @@ -506,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size) imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - ret = imr_write(idev, reg, &imr, false); + ret = imr_write(idev, reg, &imr); failed: mutex_unlock(&idev->lock); @@ -587,7 +575,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) * We don't round up @size since it is already PAGE_SIZE aligned. * See vmlinux.lds.S for details. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); if (ret < 0) { pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", size / 1024, start, end); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 0381343a0d3a..f5bad40936ac 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -60,30 +60,30 @@ static void __init imr_self_test(void) int ret; /* Test zero zero. */ - ret = imr_add_range(0, 0, 0, 0, false); + ret = imr_add_range(0, 0, 0, 0); imr_self_test_result(ret < 0, "zero sized IMR\n"); /* Test exact overlap. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with base inside of existing. */ base += size - IMR_ALIGN; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with end inside of existing. */ base -= size + IMR_ALIGN * 2; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); /* Test that a 1 KiB IMR @ zero with CPU only will work. */ - ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU); imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); if (ret >= 0) { ret = imr_remove_range(0, IMR_ALIGN); @@ -92,8 +92,7 @@ static void __init imr_self_test(void) /* Test 2 KiB works. 
*/ size = IMR_ALIGN * 2; - ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); if (ret >= 0) { ret = imr_remove_range(0, size); -- cgit From ded5874946baa15b56b531f638f9c706266deb7b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 22 Feb 2016 17:23:40 +0900 Subject: KVM: x86: MMU: Consolidate quickly_check_mmio_pf() and is_mmio_page_fault() These two have only slight differences: - whether 'addr' is of type u64 or of type gva_t - whether they have 'direct' parameter or not Concerning the former, quickly_check_mmio_pf()'s u64 is better because 'addr' needs to be able to have both a guest physical address and a guest virtual address. The latter is just a stylistic issue as we can always calculate the mode from the 'vcpu' as is_mmio_page_fault() does. This patch keeps the parameter to make the following patch cleaner. In addition, the patch renames the function to mmio_info_in_cache() to make it clear what it actually checks for. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95a955de5964..a28b734774ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3273,7 +3273,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); } -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) return vcpu_match_mmio_gpa(vcpu, addr); @@ -3332,7 +3332,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) u64 spte; bool reserved; - if (quickly_check_mmio_pf(vcpu, addr, direct)) + if (mmio_info_in_cache(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); @@ -4354,19 +4354,12 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) -{ - if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) - return vcpu_match_mmio_gpa(vcpu, addr); - - return vcpu_match_mmio_gva(vcpu, addr); -} - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; + bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); if (r < 0) @@ -4377,7 +4370,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, goto out; } - if (is_mmio_page_fault(vcpu, cr2)) + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); -- cgit From e9ee956e311d3d0a1506995b98e8de0b30773e1a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 22 Feb 2016 17:23:41 +0900 Subject: KVM: x86: MMU: Move handle_mmio_page_fault() call to kvm_mmu_page_fault() Rather than placing a handle_mmio_page_fault() call in each vcpu->arch.mmu.page_fault() handler, moving it up to kvm_mmu_page_fault() makes the code better: - avoids code duplication - for kvm_arch_async_page_ready(), which is the other caller of 
vcpu->arch.mmu.page_fault(), removes an extra error_code check - avoids returning both RET_MMIO_PF_* values and raw integer values from vcpu->arch.mmu.page_fault() Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 39 ++++++++++++++++----------------------- arch/x86/kvm/paging_tmpl.h | 19 ++++++------------- 2 files changed, 22 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a28b734774ac..2ce389245bd8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3370,13 +3370,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } - r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -3460,13 +3453,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } - r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -4361,18 +4347,27 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, cr2, direct); + if (r == RET_MMIO_PF_EMULATE) { + emulation_type = 0; + goto emulate; + } + if (r == RET_MMIO_PF_RETRY) + return 1; + if (r < 0) + return r; + } + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } + return r; + if (!r) + return 1; if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; - +emulate: er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { @@ -4386,8 +4381,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, default: BUG(); } -out: - return r; } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6c9fed957cce..05827ff7bd2e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -702,23 +702,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - - /* - * page fault with PFEC.RSVD = 1 is caused by shadow - * page fault, should not be used to walk guest page - * table. - */ - error_code &= ~PFERR_RSVD_MASK; - }; - r = mmu_topup_memory_caches(vcpu); if (r) return r; + /* + * If PFEC.RSVD is set, this is a shadow page fault. + * The bit needs to be cleared before walking guest page tables. + */ + error_code &= ~PFERR_RSVD_MASK; + /* * Look up the guest pte for the faulting address. */ -- cgit From 652fc08dae8e01574cd2497d9a76948e5c9d3920 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 1 Jan 2016 19:47:13 +0800 Subject: KVM: x86: use list_for_each_entry* Use list_for_each_entry*() instead of list_for_each*() to simplify the code. 
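The shape of the conversion on a hypothetical structure (a sketch against <linux/list.h>, not code taken from this patch):

#include <linux/list.h>

struct foo {                            /* hypothetical example structure */
        int id;
        struct list_head list;
};

/* Open-coded walk, the way the old code did it: */
static struct foo *find_foo_old(struct list_head *head, int id)
{
        struct list_head *ptr;
        struct foo *f;

        list_for_each(ptr, head) {
                f = list_entry(ptr, struct foo, list);
                if (f->id == id)
                        return f;
        }
        return NULL;
}

/* The same walk with list_for_each_entry(): */
static struct foo *find_foo_new(struct list_head *head, int id)
{
        struct foo *f;

        list_for_each_entry(f, head, list) {
                if (f->id == id)
                        return f;
        }
        return NULL;
}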
Signed-off-by: Geliang Tang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 9dc091acd5fb..308b8597c691 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel { static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { - struct list_head *ptr; struct kvm_assigned_dev_kernel *match; - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + list_for_each_entry(match, head, list) { if (match->assigned_dev_id == assigned_dev_id) return match; } @@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm, void kvm_free_all_assigned_devices(struct kvm *kvm) { - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); + struct kvm_assigned_dev_kernel *assigned_dev, *tmp; + list_for_each_entry_safe(assigned_dev, tmp, + &kvm->arch.assigned_dev_head, list) { kvm_free_assigned_device(kvm, assigned_dev); } } -- cgit From d74c0e6b54c95ace01f05264a22aed99b565fabb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 1 Jan 2016 19:47:14 +0800 Subject: KVM: x86: use list_last_entry To make the intention clearer, use list_last_entry instead of list_entry. Signed-off-by: Geliang Tang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2ce389245bd8..07f4c26a10d3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2354,8 +2354,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, if (list_empty(&kvm->arch.active_mmu_pages)) return false; - sp = list_entry(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); + sp = list_last_entry(&kvm->arch.active_mmu_pages, + struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); return true; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9f08037ef14b..aa16d5874fe6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6445,8 +6445,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { /* Recycle the least recently used VMCS. */ - item = list_entry(vmx->nested.vmcs02_pool.prev, - struct vmcs02_list, list); + item = list_last_entry(&vmx->nested.vmcs02_pool, + struct vmcs02_list, list); item->vmptr = vmx->nested.current_vmptr; list_move(&item->list, &vmx->nested.vmcs02_pool); return &item->vmcs02; -- cgit From de9e478b9d49f3a0214310d921450cf5bb4a21e6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Feb 2016 14:58:52 -0800 Subject: x86: fix SMAP in 32-bit environments In commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") I changed how the stac/clac instructions were generated around the user space accesses, which then made it possible to do batched accesses efficiently for user string copies etc. However, in doing so, I completely spaced out, and didn't even think about the 32-bit case. 
And nobody really even seemed to notice, because SMAP doesn't even exist until modern Skylake processors, and you'd have to be crazy to run 32-bit kernels on a modern CPU. Which brings us to Andy Lutomirski. He actually tested the 32-bit kernel on new hardware, and noticed that it doesn't work. My bad. The trivial fix is to add the required uaccess begin/end markers around the raw accesses in . I feel a bit bad about this patch, just because that header file really should be cleaned up to avoid all the duplicated code in it, and this commit just expands on the problem. But this just fixes the bug without any bigger cleanup surgery. Reported-and-tested-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index f5dcb5204dcd..3fe0eac59462 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -48,20 +48,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); + __uaccess_end(); return ret; case 8: + __uaccess_begin(); __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret, 8); + __uaccess_end(); return ret; } } @@ -103,13 +111,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -148,13 +162,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -170,13 +190,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } -- cgit From 0e8e2238b52e5301d1d1d4a298ec5b72ac54c702 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:09 -0600 Subject: x86/xen: Add stack frame dependency to hypercall inline asm calls If a hypercall is inlined at the beginning of a function, gcc can insert the call instruction before setting up a stack frame, which breaks frame pointer convention if CONFIG_FRAME_POINTER is enabled and can result in a bad stack trace. Force a stack frame to be created if CONFIG_FRAME_POINTER is enabled by listing the stack pointer as an output operand for the hypercall inline asm statements. 
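The underlying trick in isolation, as a hedged sketch rather than the real hypercall macros ('example_target' is a placeholder symbol, x86-64 assumed):

static inline long example_call(long arg)
{
        long ret;
        register void *__sp asm("rsp"); /* bind a variable to the stack pointer */

        /*
         * Listing __sp as an in/out operand tells GCC the asm depends on
         * the stack, so with CONFIG_FRAME_POINTER=y the frame is set up
         * before the call instruction is emitted.
         */
        asm volatile("call example_target"
                     : "=a" (ret), "+r" (__sp)
                     : "D" (arg)
                     : "memory");
        return ret;
}

The "+r" (__sp) output is the piece the patch below adds to the hypercall argument macros.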
Signed-off-by: Josh Poimboeuf Reviewed-by: David Vrabel Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/c6face5a46713108bded9c4c103637222abc4528.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 3bcdcc84259d..a12a047184ee 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -110,9 +110,10 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ + register void *__sp asm(_ASM_SP); -#define __HYPERCALL_0PARAM "=r" (__res) +#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) -- cgit From 9fd216067d75b45eb18d84dc476059de47da07c2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:10 -0600 Subject: x86/asm/xen: Set ELF function type for xen_adjust_exception_frame() xen_adjust_exception_frame() is a callable function, but is missing the ELF function type, which confuses tools like stacktool. Properly annotate it to be a callable function. The generated code is unchanged. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: David Vrabel Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/b1851bd17a0986472692a7e3a05290d891382cdd.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-asm_64.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index cc8acc410ddb..c3df43141e70 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -26,6 +26,7 @@ ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp), %rcx mov 8+8(%rsp), %r11 ret $16 +ENDPROC(xen_adjust_exception_frame) hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* -- cgit From 8be0eb7e0d53bc2dbe6e9ad6e96a2d5b89d148a8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:11 -0600 Subject: x86/asm/xen: Create stack frames in xen-asm.S xen_irq_enable_direct(), xen_restore_fl_direct(), and check_events() are callable non-leaf functions which don't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. 
Create stack frames for them when CONFIG_FRAME_POINTER is enabled. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: David Vrabel Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/a8340ad3fc72ba9ed34da9b3af9cdd6f1a896e17.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-asm.S | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 3e45aa000718..eff224df813f 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -14,6 +14,7 @@ #include #include #include +#include #include "xen-asm.h" @@ -23,6 +24,7 @@ * then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) + FRAME_BEGIN /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask @@ -39,6 +41,7 @@ ENTRY(xen_irq_enable_direct) 2: call check_events 1: ENDPATCH(xen_irq_enable_direct) + FRAME_END ret ENDPROC(xen_irq_enable_direct) RELOC(xen_irq_enable_direct, 2b+1) @@ -82,6 +85,7 @@ ENDPATCH(xen_save_fl_direct) * enters the hypervisor to get them delivered if so. */ ENTRY(xen_restore_fl_direct) + FRAME_BEGIN #ifdef CONFIG_X86_64 testw $X86_EFLAGS_IF, %di #else @@ -100,6 +104,7 @@ ENTRY(xen_restore_fl_direct) 2: call check_events 1: ENDPATCH(xen_restore_fl_direct) + FRAME_END ret ENDPROC(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) @@ -109,7 +114,8 @@ ENDPATCH(xen_restore_fl_direct) * Force an event check by making a hypercall, but preserve regs * before making the call. */ -check_events: +ENTRY(check_events) + FRAME_BEGIN #ifdef CONFIG_X86_32 push %eax push %ecx @@ -139,4 +145,6 @@ check_events: pop %rcx pop %rax #endif + FRAME_END ret +ENDPROC(check_events) -- cgit From bb93eb4cd606888f879897e8289d9c1143571feb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:12 -0600 Subject: x86/paravirt: Add stack frame dependency to PVOP inline asm calls If a PVOP call macro is inlined at the beginning of a function, gcc can insert the call instruction before setting up a stack frame, which breaks frame pointer convention if CONFIG_FRAME_POINTER is enabled and can result in a bad stack trace. Force a stack frame to be created if CONFIG_FRAME_POINTER is enabled by listing the stack pointer as an output operand for the PVOP inline asm statements. Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Alok Kataria Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Chris Wright Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jeremy Fitzhardinge Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/6a13e48c5a8cf2de1aa112ae2d4c0ac194096282.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt_types.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 77db5616a473..e8c2326478c8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -466,8 +466,9 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \ + register void *__sp asm("esp") #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) @@ -485,9 +486,10 @@ int paravirt_disable_iospace(void); #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ /* [re]ax isn't an arg, but the return val */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx, __eax = __eax +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx, __eax = __eax; \ + register void *__sp asm("rsp") #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) @@ -526,7 +528,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : call_clbr, "+r" (__sp) \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -536,7 +538,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : call_clbr, "+r" (__sp) \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -563,7 +565,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : call_clbr, "+r" (__sp) \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ -- cgit From 87b240cbe3e51bf070fe2839fecb6450323aaef4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:13 -0600 Subject: x86/paravirt: Create a stack frame in PV_CALLEE_SAVE_REGS_THUNK A function created with the PV_CALLEE_SAVE_REGS_THUNK macro doesn't set up a new stack frame before the call instruction, which breaks frame pointer convention if CONFIG_FRAME_POINTER is enabled and can result in a bad stack trace. Also, the thunk functions aren't annotated as ELF callable functions. Create a stack frame when CONFIG_FRAME_POINTER is enabled and add the ELF function type. Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Alok Kataria Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Chris Wright Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jeremy Fitzhardinge Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/a2cad74e87c4aba7fd0f54a1af312e66a824a575.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f6192502149e..601f1b8f9961 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -13,6 +13,7 @@ #include #include #include +#include static inline int paravirt_enabled(void) { @@ -756,15 +757,19 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, * call. The return value in rax/eax will not be saved, even for void * functions. */ +#define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define PV_CALLEE_SAVE_REGS_THUNK(func) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection .text;" \ - ".globl __raw_callee_save_" #func " ; " \ - "__raw_callee_save_" #func ": " \ + ".globl " PV_THUNK_NAME(func) ";" \ + ".type " PV_THUNK_NAME(func) ", @function;" \ + PV_THUNK_NAME(func) ":" \ + FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ + FRAME_END \ "ret;" \ ".popsection") -- cgit From de642faf48670c3c8eae5899177f786c624f4894 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:14 -0600 Subject: x86/amd: Set ELF function type for vide() vide() is a callable function, but is missing the ELF function type, which confuses tools like stacktool. Properly annotate it to be a callable function. The generated code is unchanged. Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/a324095f5c9390ff39b15b4562ea1bbeda1a8282.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a07956a08936..fe2f089f03d2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -75,7 +75,10 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) */ extern __visible void vide(void); -__asm__(".globl vide\n\t.align 4\nvide: ret"); +__asm__(".globl vide\n" + ".type vide, @function\n" + ".align 4\n" + "vide: ret\n"); static void init_amd_k5(struct cpuinfo_x86 *c) { -- cgit From 1253cab8a35278c0493a48aa5ab74992f7a849ea Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:15 -0600 Subject: x86/asm/crypto: Move .Lbswap_mask data to .rodata section stacktool reports the following warning: stacktool: arch/x86/crypto/aesni-intel_asm.o: _aesni_inc_init(): can't find starting instruction stacktool gets confused when it tries to disassemble the following data in the .text section: .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 Move it to .rodata which is a more appropriate section for read-only data. 
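For reference, the same .text-to-.rodata move in a standalone form (assuming x86-64 with GNU as and GCC; demo_mask and demo_first_byte are made-up names, not the kernel symbols):

	/* Sketch: keep constant tables in .rodata so only instructions live in .text. */
	#include <stdio.h>

	asm(
	"	.pushsection .rodata\n"		/* read-only data, not code */
	"	.align 16\n"
	"demo_mask:\n"
	"	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0\n"
	"	.popsection\n"			/* back to the previous section */

	"	.text\n"
	"	.globl demo_first_byte\n"
	"	.type demo_first_byte, @function\n"
	"demo_first_byte:\n"
	"	movzbl demo_mask(%rip), %eax\n"	/* load the first table byte */
	"	ret\n"
	"	.size demo_first_byte, .-demo_first_byte\n"
	);

	extern int demo_first_byte(void);

	int main(void)
	{
		printf("%d\n", demo_first_byte());	/* prints 15 */
		return 0;
	}

Keeping pure data out of .text means instruction decoders such as stacktool only ever encounter executable code there, and the table still ends up in read-only pages.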
Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: David S. Miller Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Herbert Xu Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/b6a2f3f8bda705143e127c025edb2b53c86e6eb4.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/aesni-intel_asm.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 6bd2c6c95373..c44cfedbe1c3 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2538,9 +2538,11 @@ ENTRY(aesni_cbc_dec) ENDPROC(aesni_cbc_dec) #ifdef __x86_64__ +.pushsection .rodata .align 16 .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.popsection /* * _aesni_inc_init: internal ABI -- cgit From f66f61919eb38c5f20e8a978cae7ecdede4c23b9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:16 -0600 Subject: x86/asm/crypto: Move jump_table to .rodata section stacktool reports the following warning: stacktool: arch/x86/crypto/crc32c-pcl-intel-asm_64.o: crc_pcl()+0x11dd: can't decode instruction It gets confused when trying to decode jump_table data. Move jump_table to the .rodata section which is a more appropriate home for read-only data. Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: David S. Miller Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Herbert Xu Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/1dbf80c097bb9d89c0cbddc01a815ada690e3b32.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 4fe27e074194..dc05f010ca9b 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -170,8 +170,8 @@ continue_block: ## branch into array lea jump_table(%rip), bufp movzxw (bufp, %rax, 2), len - offset=crc_array-jump_table - lea offset(bufp, len, 1), bufp + lea crc_array(%rip), bufp + lea (bufp, len, 1), bufp jmp *bufp ################################################################ @@ -310,7 +310,9 @@ do_return: popq %rdi popq %rbx ret +ENDPROC(crc_pcl) +.section .rodata, "a", %progbits ################################################################ ## jump table Table is 129 entries x 2 bytes each ################################################################ @@ -324,13 +326,11 @@ JMPTBL_ENTRY %i i=i+1 .endr -ENDPROC(crc_pcl) ################################################################ ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.section .rodata, "a", %progbits .align 8 K_table: .long 0x493c7d27, 0x00000001 -- cgit From aec4d0e301f17bb143341c82cc44685b8af0b945 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:17 -0600 Subject: x86/asm/crypto: Simplify stack usage in sha-mb functions sha1_mb_mgr_flush_avx2() and sha1_mb_mgr_submit_avx2() both allocate a lot of stack space which is never used. Also, many of the registers being saved aren't being clobbered so there's no need to save them. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. 
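For reference, the reduced prologue/epilogue in isolation (assuming the x86-64 SysV ABI; demo_inc is a made-up function, not the SHA1-MB code): only the callee-saved registers a function actually modifies need to be preserved, so a single push/pop replaces the large aligned save area:

	/* Sketch: save exactly the callee-saved register the function clobbers. */
	#include <stdio.h>

	asm(
	"	.text\n"
	"	.globl demo_inc\n"
	"	.type demo_inc, @function\n"
	"demo_inc:\n"
	"	push %rbx\n"		/* the one callee-saved register we touch */
	"	mov %rdi, %rbx\n"
	"	lea 1(%rbx), %rax\n"
	"	pop %rbx\n"
	"	ret\n"
	"	.size demo_inc, .-demo_inc\n"
	);

	extern long demo_inc(long x);

	int main(void)
	{
		printf("%ld\n", demo_inc(41));	/* prints 42 */
		return 0;
	}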
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/9402e4d87580d6b2376ed95f67b84bdcce3c830e.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S | 32 ++---------------------- arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S | 29 +++------------------ 2 files changed, 6 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S index 85c4e1cf7172..672eaeb258a5 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S @@ -86,16 +86,6 @@ #define extra_blocks %arg2 #define p %arg2 - -# STACK_SPACE needs to be an odd multiple of 8 -_XMM_SAVE_SIZE = 10*16 -_GPR_SAVE_SIZE = 8*8 -_ALIGN_SIZE = 8 - -_XMM_SAVE = 0 -_GPR_SAVE = _XMM_SAVE + _XMM_SAVE_SIZE -STACK_SPACE = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE - .macro LABEL prefix n \prefix\n\(): .endm @@ -113,16 +103,7 @@ offset = \_offset # JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state) # arg 1 : rcx : state ENTRY(sha1_mb_mgr_flush_avx2) - mov %rsp, %r10 - sub $STACK_SPACE, %rsp - and $~31, %rsp - mov %rbx, _GPR_SAVE(%rsp) - mov %r10, _GPR_SAVE+8*1(%rsp) #save rsp - mov %rbp, _GPR_SAVE+8*3(%rsp) - mov %r12, _GPR_SAVE+8*4(%rsp) - mov %r13, _GPR_SAVE+8*5(%rsp) - mov %r14, _GPR_SAVE+8*6(%rsp) - mov %r15, _GPR_SAVE+8*7(%rsp) + push %rbx # If bit (32+3) is set, then all lanes are empty mov _unused_lanes(state), unused_lanes @@ -230,16 +211,7 @@ len_is_0: mov tmp2_w, offset(job_rax) return: - - mov _GPR_SAVE(%rsp), %rbx - mov _GPR_SAVE+8*1(%rsp), %r10 #saved rsp - mov _GPR_SAVE+8*3(%rsp), %rbp - mov _GPR_SAVE+8*4(%rsp), %r12 - mov _GPR_SAVE+8*5(%rsp), %r13 - mov _GPR_SAVE+8*6(%rsp), %r14 - mov _GPR_SAVE+8*7(%rsp), %r15 - mov %r10, %rsp - + pop %rbx ret return_null: diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S index 2ab9560b53c8..a5a14c62d843 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S @@ -94,25 +94,12 @@ DWORD_tmp = %r9d lane_data = %r10 -# STACK_SPACE needs to be an odd multiple of 8 -STACK_SPACE = 8*8 + 16*10 + 8 - # JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job) # arg 1 : rcx : state # arg 2 : rdx : job ENTRY(sha1_mb_mgr_submit_avx2) - - mov %rsp, %r10 - sub $STACK_SPACE, %rsp - and $~31, %rsp - - mov %rbx, (%rsp) - mov %r10, 8*2(%rsp) #save old rsp - mov %rbp, 8*3(%rsp) - mov %r12, 8*4(%rsp) - mov %r13, 8*5(%rsp) - mov %r14, 8*6(%rsp) - mov %r15, 8*7(%rsp) + push %rbx + push %rbp mov _unused_lanes(state), unused_lanes mov unused_lanes, lane @@ -203,16 +190,8 @@ len_is_0: movl DWORD_tmp, _result_digest+1*16(job_rax) return: - - mov (%rsp), %rbx - mov 8*2(%rsp), %r10 #save old rsp - mov 8*3(%rsp), %rbp - mov 8*4(%rsp), %r12 - mov 8*5(%rsp), %r13 - mov 8*6(%rsp), %r14 - mov 8*7(%rsp), %r15 - mov %r10, %rsp - + pop %rbp + pop %rbx ret return_null: -- cgit From 68874ac3304ade7ed5ebb12af00d6b9bbbca0a16 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:18 -0600 Subject: x86/asm/crypto: Don't use RBP as a scratch register The frame pointer (RBP) is getting clobbered in sha1_mb_mgr_submit_avx2() before a function call, which can mess up stack traces. Use R12 instead. 
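For reference, what the r12-instead-of-rbp arrangement looks like on its own (assuming the x86-64 SysV ABI; demo_helper() and demo_sum_with_next() are made-up names): %rbp keeps pointing at the saved frame so unwinders can walk through the function, while %r12, which is also callee-saved, carries the value that must survive the call:

	/* Sketch: %rbp stays a frame pointer, %r12 is the call-surviving scratch. */
	#include <stdio.h>

	long demo_helper(long x)
	{
		return x + 1;
	}

	asm(
	"	.text\n"
	"	.globl demo_sum_with_next\n"
	"	.type demo_sum_with_next, @function\n"
	"demo_sum_with_next:\n"
	"	push %rbp\n"
	"	mov %rsp, %rbp\n"	/* frame pointer stays intact          */
	"	push %r12\n"		/* callee-saved scratch instead of %rbp */
	"	sub $8, %rsp\n"		/* keep the call site 16-byte aligned  */
	"	mov %rdi, %r12\n"
	"	call demo_helper\n"
	"	add %r12, %rax\n"	/* %r12 survived the call              */
	"	add $8, %rsp\n"
	"	pop %r12\n"
	"	pop %rbp\n"
	"	ret\n"
	"	.size demo_sum_with_next, .-demo_sum_with_next\n"
	);

	extern long demo_sum_with_next(long x);

	int main(void)
	{
		printf("%ld\n", demo_sum_with_next(20));	/* 20 + 21 = 41 */
		return 0;
	}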
Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/15a3eb7ebe68e37755927915f45e4f0bde4d18c5.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S index a5a14c62d843..c3b944781d3f 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S @@ -86,8 +86,8 @@ job_rax = %rax len = %rax DWORD_len = %eax -lane = %rbp -tmp3 = %rbp +lane = %r12 +tmp3 = %r12 tmp = %r9 DWORD_tmp = %r9d @@ -99,7 +99,7 @@ lane_data = %r10 # arg 2 : rdx : job ENTRY(sha1_mb_mgr_submit_avx2) push %rbx - push %rbp + push %r12 mov _unused_lanes(state), unused_lanes mov unused_lanes, lane @@ -190,7 +190,7 @@ len_is_0: movl DWORD_tmp, _result_digest+1*16(job_rax) return: - pop %rbp + pop %r12 pop %rbx ret -- cgit From 8691ccd764f9ecc69a6812dfe76214c86ac9ba06 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:19 -0600 Subject: x86/asm/crypto: Create stack frames in crypto functions The crypto code has several callable non-leaf functions which don't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Create stack frames for them when CONFIG_FRAME_POINTER is enabled. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: David S. Miller Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Herbert Xu Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/6c20192bcf1102ae18ae5a242cabf30ce9b29895.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/aesni-intel_asm.S | 73 +++++++++++++++--------- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 15 +++++ arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 15 +++++ arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 9 +++ arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 13 +++++ arch/x86/crypto/ghash-clmulni-intel_asm.S | 5 ++ arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 13 +++++ arch/x86/crypto/serpent-avx2-asm_64.S | 13 +++++ arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S | 3 + arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S | 3 + arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 13 +++++ 11 files changed, 148 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index c44cfedbe1c3..383a6f84a060 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -31,6 +31,7 @@ #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -1800,11 +1801,12 @@ ENDPROC(_key_expansion_256b) * unsigned int key_len) */ ENTRY(aesni_set_key) + FRAME_BEGIN #ifndef __x86_64__ pushl KEYP - movl 8(%esp), KEYP # ctx - movl 12(%esp), UKEYP # in_key - movl 16(%esp), %edx # key_len + movl (FRAME_OFFSET+8)(%esp), KEYP # ctx + movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key + movl (FRAME_OFFSET+16)(%esp), %edx # key_len #endif movups (UKEYP), %xmm0 # user key (first 16 bytes) movaps %xmm0, (KEYP) @@ -1905,6 +1907,7 @@ ENTRY(aesni_set_key) #ifndef __x86_64__ popl KEYP #endif + FRAME_END ret ENDPROC(aesni_set_key) @@ -1912,12 +1915,13 @@ ENDPROC(aesni_set_key) * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_enc) + FRAME_BEGIN #ifndef __x86_64__ pushl KEYP pushl KLEN - movl 12(%esp), KEYP - movl 16(%esp), OUTP - movl 20(%esp), INP + movl (FRAME_OFFSET+12)(%esp), KEYP # ctx + movl (FRAME_OFFSET+16)(%esp), OUTP # dst + movl (FRAME_OFFSET+20)(%esp), INP # src #endif movl 480(KEYP), KLEN # key length movups (INP), STATE # input @@ -1927,6 +1931,7 @@ ENTRY(aesni_enc) popl KLEN popl KEYP #endif + FRAME_END ret ENDPROC(aesni_enc) @@ -2101,12 +2106,13 @@ ENDPROC(_aesni_enc4) * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_dec) + FRAME_BEGIN #ifndef __x86_64__ pushl KEYP pushl KLEN - movl 12(%esp), KEYP - movl 16(%esp), OUTP - movl 20(%esp), INP + movl (FRAME_OFFSET+12)(%esp), KEYP # ctx + movl (FRAME_OFFSET+16)(%esp), OUTP # dst + movl (FRAME_OFFSET+20)(%esp), INP # src #endif mov 480(KEYP), KLEN # key length add $240, KEYP @@ -2117,6 +2123,7 @@ ENTRY(aesni_dec) popl KLEN popl KEYP #endif + FRAME_END ret ENDPROC(aesni_dec) @@ -2292,14 +2299,15 @@ ENDPROC(_aesni_dec4) * size_t len) */ ENTRY(aesni_ecb_enc) + FRAME_BEGIN #ifndef __x86_64__ pushl LEN pushl KEYP pushl KLEN - movl 16(%esp), KEYP - movl 20(%esp), OUTP - movl 24(%esp), INP - movl 28(%esp), LEN + movl (FRAME_OFFSET+16)(%esp), KEYP # ctx + movl (FRAME_OFFSET+20)(%esp), OUTP # dst + movl (FRAME_OFFSET+24)(%esp), INP # src + movl (FRAME_OFFSET+28)(%esp), LEN # len #endif test LEN, LEN # check length jz .Lecb_enc_ret @@ -2342,6 +2350,7 @@ ENTRY(aesni_ecb_enc) popl KEYP popl LEN #endif + FRAME_END ret ENDPROC(aesni_ecb_enc) @@ 
-2350,14 +2359,15 @@ ENDPROC(aesni_ecb_enc) * size_t len); */ ENTRY(aesni_ecb_dec) + FRAME_BEGIN #ifndef __x86_64__ pushl LEN pushl KEYP pushl KLEN - movl 16(%esp), KEYP - movl 20(%esp), OUTP - movl 24(%esp), INP - movl 28(%esp), LEN + movl (FRAME_OFFSET+16)(%esp), KEYP # ctx + movl (FRAME_OFFSET+20)(%esp), OUTP # dst + movl (FRAME_OFFSET+24)(%esp), INP # src + movl (FRAME_OFFSET+28)(%esp), LEN # len #endif test LEN, LEN jz .Lecb_dec_ret @@ -2401,6 +2411,7 @@ ENTRY(aesni_ecb_dec) popl KEYP popl LEN #endif + FRAME_END ret ENDPROC(aesni_ecb_dec) @@ -2409,16 +2420,17 @@ ENDPROC(aesni_ecb_dec) * size_t len, u8 *iv) */ ENTRY(aesni_cbc_enc) + FRAME_BEGIN #ifndef __x86_64__ pushl IVP pushl LEN pushl KEYP pushl KLEN - movl 20(%esp), KEYP - movl 24(%esp), OUTP - movl 28(%esp), INP - movl 32(%esp), LEN - movl 36(%esp), IVP + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv #endif cmp $16, LEN jb .Lcbc_enc_ret @@ -2443,6 +2455,7 @@ ENTRY(aesni_cbc_enc) popl LEN popl IVP #endif + FRAME_END ret ENDPROC(aesni_cbc_enc) @@ -2451,16 +2464,17 @@ ENDPROC(aesni_cbc_enc) * size_t len, u8 *iv) */ ENTRY(aesni_cbc_dec) + FRAME_BEGIN #ifndef __x86_64__ pushl IVP pushl LEN pushl KEYP pushl KLEN - movl 20(%esp), KEYP - movl 24(%esp), OUTP - movl 28(%esp), INP - movl 32(%esp), LEN - movl 36(%esp), IVP + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv #endif cmp $16, LEN jb .Lcbc_dec_just_ret @@ -2534,6 +2548,7 @@ ENTRY(aesni_cbc_dec) popl LEN popl IVP #endif + FRAME_END ret ENDPROC(aesni_cbc_dec) @@ -2600,6 +2615,7 @@ ENDPROC(_aesni_inc) * size_t len, u8 *iv) */ ENTRY(aesni_ctr_enc) + FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret mov 480(KEYP), KLEN @@ -2653,6 +2669,7 @@ ENTRY(aesni_ctr_enc) .Lctr_enc_ret: movups IV, (IVP) .Lctr_enc_just_ret: + FRAME_END ret ENDPROC(aesni_ctr_enc) @@ -2679,6 +2696,7 @@ ENDPROC(aesni_ctr_enc) * bool enc, u8 *iv) */ ENTRY(aesni_xts_crypt8) + FRAME_BEGIN cmpb $0, %cl movl $0, %ecx movl $240, %r10d @@ -2779,6 +2797,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu STATE4, 0x70(OUTP) + FRAME_END ret ENDPROC(aesni_xts_crypt8) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index ce71f9212409..aa9e8bd163f6 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -726,6 +727,7 @@ __camellia_enc_blk16: * %xmm0..%xmm15: 16 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 16(%rax), %rcx; @@ -780,6 +782,7 @@ __camellia_enc_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); + FRAME_END ret; .align 8 @@ -812,6 +815,7 @@ __camellia_dec_blk16: * %xmm0..%xmm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 16(%rax), %rcx; @@ -865,6 +869,7 @@ __camellia_dec_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); + FRAME_END ret; .align 8 @@ -890,6 +895,7 @@ ENTRY(camellia_ecb_enc_16way) * %rsi: dst (16 blocks) * %rdx: src (16 blocks) */ + 
FRAME_BEGIN inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, @@ -904,6 +910,7 @@ ENTRY(camellia_ecb_enc_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_ecb_enc_16way) @@ -913,6 +920,7 @@ ENTRY(camellia_ecb_dec_16way) * %rsi: dst (16 blocks) * %rdx: src (16 blocks) */ + FRAME_BEGIN cmpl $16, key_length(CTX); movl $32, %r8d; @@ -932,6 +940,7 @@ ENTRY(camellia_ecb_dec_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_ecb_dec_16way) @@ -941,6 +950,7 @@ ENTRY(camellia_cbc_dec_16way) * %rsi: dst (16 blocks) * %rdx: src (16 blocks) */ + FRAME_BEGIN cmpl $16, key_length(CTX); movl $32, %r8d; @@ -981,6 +991,7 @@ ENTRY(camellia_cbc_dec_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_cbc_dec_16way) @@ -997,6 +1008,7 @@ ENTRY(camellia_ctr_16way) * %rdx: src (16 blocks) * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN subq $(16 * 16), %rsp; movq %rsp, %rax; @@ -1092,6 +1104,7 @@ ENTRY(camellia_ctr_16way) %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_ctr_16way) @@ -1112,6 +1125,7 @@ camellia_xts_crypt_16way: * %r8: index for input whitening key * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 */ + FRAME_BEGIN subq $(16 * 16), %rsp; movq %rsp, %rax; @@ -1234,6 +1248,7 @@ camellia_xts_crypt_16way: %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, %xmm8, %rsi); + FRAME_END ret; ENDPROC(camellia_xts_crypt_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e0b8863a34b..16186c18656d 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -11,6 +11,7 @@ */ #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -766,6 +767,7 @@ __camellia_enc_blk32: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 32(%rax), %rcx; @@ -820,6 +822,7 @@ __camellia_enc_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); + FRAME_END ret; .align 8 @@ -852,6 +855,7 @@ __camellia_dec_blk32: * %ymm0..%ymm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + FRAME_BEGIN leaq 8 * 32(%rax), %rcx; @@ -905,6 +909,7 @@ __camellia_dec_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); + FRAME_END ret; .align 8 @@ -930,6 +935,7 @@ ENTRY(camellia_ecb_enc_32way) * %rsi: dst (32 blocks) * %rdx: src (32 blocks) */ + FRAME_BEGIN vzeroupper; @@ -948,6 +954,7 @@ ENTRY(camellia_ecb_enc_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_ecb_enc_32way) @@ -957,6 +964,7 @@ ENTRY(camellia_ecb_dec_32way) * %rsi: dst (32 blocks) * %rdx: src (32 blocks) */ + FRAME_BEGIN vzeroupper; @@ -980,6 +988,7 @@ ENTRY(camellia_ecb_dec_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_ecb_dec_32way) @@ -989,6 +998,7 @@ ENTRY(camellia_cbc_dec_32way) * %rsi: dst (32 blocks) * %rdx: src (32 blocks) */ + FRAME_BEGIN vzeroupper; @@ -1046,6 +1056,7 @@ ENTRY(camellia_cbc_dec_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_cbc_dec_32way) @@ -1070,6 +1081,7 @@ ENTRY(camellia_ctr_32way) * %rdx: src (32 blocks) * %rcx: iv (little endian, 128bit) */ + 
FRAME_BEGIN vzeroupper; @@ -1184,6 +1196,7 @@ ENTRY(camellia_ctr_32way) vzeroupper; + FRAME_END ret; ENDPROC(camellia_ctr_32way) @@ -1216,6 +1229,7 @@ camellia_xts_crypt_32way: * %r8: index for input whitening key * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 */ + FRAME_BEGIN vzeroupper; @@ -1349,6 +1363,7 @@ camellia_xts_crypt_32way: vzeroupper; + FRAME_END ret; ENDPROC(camellia_xts_crypt_32way) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index c35fd5d6ecd2..14fa1966bf01 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include +#include .file "cast5-avx-x86_64-asm_64.S" @@ -365,6 +366,7 @@ ENTRY(cast5_ecb_enc_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -388,6 +390,7 @@ ENTRY(cast5_ecb_enc_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + FRAME_END ret; ENDPROC(cast5_ecb_enc_16way) @@ -398,6 +401,7 @@ ENTRY(cast5_ecb_dec_16way) * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -420,6 +424,7 @@ ENTRY(cast5_ecb_dec_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + FRAME_END ret; ENDPROC(cast5_ecb_dec_16way) @@ -429,6 +434,7 @@ ENTRY(cast5_cbc_dec_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN pushq %r12; @@ -469,6 +475,7 @@ ENTRY(cast5_cbc_dec_16way) popq %r12; + FRAME_END ret; ENDPROC(cast5_cbc_dec_16way) @@ -479,6 +486,7 @@ ENTRY(cast5_ctr_16way) * %rdx: src * %rcx: iv (big endian, 64bit) */ + FRAME_BEGIN pushq %r12; @@ -542,5 +550,6 @@ ENTRY(cast5_ctr_16way) popq %r12; + FRAME_END ret; ENDPROC(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index e3531f833951..c419389889cd 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include +#include #include "glue_helper-asm-avx.S" .file "cast6-avx-x86_64-asm_64.S" @@ -349,6 +350,7 @@ ENTRY(cast6_ecb_enc_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -358,6 +360,7 @@ ENTRY(cast6_ecb_enc_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_ecb_enc_8way) @@ -367,6 +370,7 @@ ENTRY(cast6_ecb_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -376,6 +380,7 @@ ENTRY(cast6_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_ecb_dec_8way) @@ -385,6 +390,7 @@ ENTRY(cast6_cbc_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN pushq %r12; @@ -399,6 +405,7 @@ ENTRY(cast6_cbc_dec_8way) popq %r12; + FRAME_END ret; ENDPROC(cast6_cbc_dec_8way) @@ -409,6 +416,7 @@ ENTRY(cast6_ctr_8way) * %rdx: src * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN pushq %r12; @@ -424,6 +432,7 @@ ENTRY(cast6_ctr_8way) popq %r12; + FRAME_END ret; ENDPROC(cast6_ctr_8way) @@ -434,6 +443,7 @@ ENTRY(cast6_xts_enc_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -446,6 +456,7 @@ ENTRY(cast6_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_xts_enc_8way) @@ -456,6 +467,7 @@ ENTRY(cast6_xts_dec_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -468,5 +480,6 @@ ENTRY(cast6_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(cast6_xts_dec_8way) diff 
--git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 5d1e0075ac24..eed55c8cca4f 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -18,6 +18,7 @@ #include #include +#include .data @@ -94,6 +95,7 @@ ENDPROC(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const u128 *shash) */ ENTRY(clmul_ghash_mul) + FRAME_BEGIN movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP @@ -101,6 +103,7 @@ ENTRY(clmul_ghash_mul) call __clmul_gf128mul_ble PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) + FRAME_END ret ENDPROC(clmul_ghash_mul) @@ -109,6 +112,7 @@ ENDPROC(clmul_ghash_mul) * const u128 *shash); */ ENTRY(clmul_ghash_update) + FRAME_BEGIN cmp $16, %rdx jb .Lupdate_just_ret # check length movaps .Lbswap_mask, BSWAP @@ -128,5 +132,6 @@ ENTRY(clmul_ghash_update) PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) .Lupdate_just_ret: + FRAME_END ret ENDPROC(clmul_ghash_update) diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 2f202f49872b..8be571808342 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include +#include #include "glue_helper-asm-avx.S" .file "serpent-avx-x86_64-asm_64.S" @@ -681,6 +682,7 @@ ENTRY(serpent_ecb_enc_8way_avx) * %rsi: dst * %rdx: src */ + FRAME_BEGIN load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -688,6 +690,7 @@ ENTRY(serpent_ecb_enc_8way_avx) store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(serpent_ecb_enc_8way_avx) @@ -697,6 +700,7 @@ ENTRY(serpent_ecb_dec_8way_avx) * %rsi: dst * %rdx: src */ + FRAME_BEGIN load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -704,6 +708,7 @@ ENTRY(serpent_ecb_dec_8way_avx) store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + FRAME_END ret; ENDPROC(serpent_ecb_dec_8way_avx) @@ -713,6 +718,7 @@ ENTRY(serpent_cbc_dec_8way_avx) * %rsi: dst * %rdx: src */ + FRAME_BEGIN load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -720,6 +726,7 @@ ENTRY(serpent_cbc_dec_8way_avx) store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + FRAME_END ret; ENDPROC(serpent_cbc_dec_8way_avx) @@ -730,6 +737,7 @@ ENTRY(serpent_ctr_8way_avx) * %rdx: src * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, RK0, RK1, RK2); @@ -738,6 +746,7 @@ ENTRY(serpent_ctr_8way_avx) store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(serpent_ctr_8way_avx) @@ -748,6 +757,7 @@ ENTRY(serpent_xts_enc_8way_avx) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN /* regs <= src, dst <= IVs, regs <= regs xor IVs */ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, @@ -758,6 +768,7 @@ ENTRY(serpent_xts_enc_8way_avx) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(serpent_xts_enc_8way_avx) @@ -768,6 +779,7 @@ ENTRY(serpent_xts_dec_8way_avx) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN /* regs <= src, dst <= IVs, regs <= regs xor IVs */ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, @@ -778,5 +790,6 @@ ENTRY(serpent_xts_dec_8way_avx) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + FRAME_END ret; ENDPROC(serpent_xts_dec_8way_avx) diff --git 
a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index b222085cccac..97c48add33ed 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -15,6 +15,7 @@ */ #include +#include #include "glue_helper-asm-avx2.S" .file "serpent-avx2-asm_64.S" @@ -673,6 +674,7 @@ ENTRY(serpent_ecb_enc_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN vzeroupper; @@ -684,6 +686,7 @@ ENTRY(serpent_ecb_enc_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_ecb_enc_16way) @@ -693,6 +696,7 @@ ENTRY(serpent_ecb_dec_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN vzeroupper; @@ -704,6 +708,7 @@ ENTRY(serpent_ecb_dec_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_ecb_dec_16way) @@ -713,6 +718,7 @@ ENTRY(serpent_cbc_dec_16way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN vzeroupper; @@ -725,6 +731,7 @@ ENTRY(serpent_cbc_dec_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_cbc_dec_16way) @@ -735,6 +742,7 @@ ENTRY(serpent_ctr_16way) * %rdx: src (16 blocks) * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN vzeroupper; @@ -748,6 +756,7 @@ ENTRY(serpent_ctr_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_ctr_16way) @@ -758,6 +767,7 @@ ENTRY(serpent_xts_enc_16way) * %rdx: src (16 blocks) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN vzeroupper; @@ -772,6 +782,7 @@ ENTRY(serpent_xts_enc_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_xts_enc_16way) @@ -782,6 +793,7 @@ ENTRY(serpent_xts_dec_16way) * %rdx: src (16 blocks) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN vzeroupper; @@ -796,5 +808,6 @@ ENTRY(serpent_xts_dec_16way) vzeroupper; + FRAME_END ret; ENDPROC(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S index 672eaeb258a5..96df6a39d7e2 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S @@ -52,6 +52,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include +#include #include "sha1_mb_mgr_datastruct.S" @@ -103,6 +104,7 @@ offset = \_offset # JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state) # arg 1 : rcx : state ENTRY(sha1_mb_mgr_flush_avx2) + FRAME_BEGIN push %rbx # If bit (32+3) is set, then all lanes are empty @@ -212,6 +214,7 @@ len_is_0: return: pop %rbx + FRAME_END ret return_null: diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S index c3b944781d3f..1435acf46f22 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S @@ -53,6 +53,7 @@ */ #include +#include #include "sha1_mb_mgr_datastruct.S" @@ -98,6 +99,7 @@ lane_data = %r10 # arg 1 : rcx : state # arg 2 : rdx : job ENTRY(sha1_mb_mgr_submit_avx2) + FRAME_BEGIN push %rbx push %r12 @@ -192,6 +194,7 @@ len_is_0: return: pop %r12 pop %rbx + FRAME_END ret return_null: diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 05058134c443..dc66273e610d 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -24,6 +24,7 @@ */ #include +#include #include "glue_helper-asm-avx.S" .file "twofish-avx-x86_64-asm_64.S" @@ -333,6 +334,7 @@ ENTRY(twofish_ecb_enc_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -342,6 +344,7 @@ ENTRY(twofish_ecb_enc_8way) store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + FRAME_END ret; ENDPROC(twofish_ecb_enc_8way) @@ -351,6 +354,7 @@ ENTRY(twofish_ecb_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN movq %rsi, %r11; @@ -360,6 +364,7 @@ ENTRY(twofish_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(twofish_ecb_dec_8way) @@ -369,6 +374,7 @@ ENTRY(twofish_cbc_dec_8way) * %rsi: dst * %rdx: src */ + FRAME_BEGIN pushq %r12; @@ -383,6 +389,7 @@ ENTRY(twofish_cbc_dec_8way) popq %r12; + FRAME_END ret; ENDPROC(twofish_cbc_dec_8way) @@ -393,6 +400,7 @@ ENTRY(twofish_ctr_8way) * %rdx: src * %rcx: iv (little endian, 128bit) */ + FRAME_BEGIN pushq %r12; @@ -408,6 +416,7 @@ ENTRY(twofish_ctr_8way) popq %r12; + FRAME_END ret; ENDPROC(twofish_ctr_8way) @@ -418,6 +427,7 @@ ENTRY(twofish_xts_enc_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -430,6 +440,7 @@ ENTRY(twofish_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + FRAME_END ret; ENDPROC(twofish_xts_enc_8way) @@ -440,6 +451,7 @@ ENTRY(twofish_xts_dec_8way) * %rdx: src * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ + FRAME_BEGIN movq %rsi, %r11; @@ -452,5 +464,6 @@ ENTRY(twofish_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + FRAME_END ret; ENDPROC(twofish_xts_dec_8way) -- cgit From 058fb73274f9e7eb72acc9836cbb2c4a9f9659a0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:20 -0600 Subject: x86/asm/entry: Create stack frames in thunk functions Thunk functions are callable non-leaf functions that don't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Also they aren't annotated as ELF callable functions which can confuse tooling. Create stack frames for them when CONFIG_FRAME_POINTER is enabled and add the ELF function type. 
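The FRAME_BEGIN/FRAME_END macros (from asm/frame.h) are used the same way throughout the patches above: with CONFIG_FRAME_POINTER enabled they boil down to the usual frame-pointer prologue/epilogue, and otherwise they expand to nothing. A standalone sketch of the resulting shape (assuming x86-64; demo_thunk() and demo_work() are made-up names, not the thunk code itself):

	/* Sketch: frame-pointer prologue/epilogue plus ELF annotations on an asm function. */
	#include <stdio.h>

	long demo_work(long x)
	{
		return 2 * x;
	}

	asm(
	"	.text\n"
	"	.globl demo_thunk\n"
	"	.type demo_thunk, @function\n"	/* ELF type: tools see a real function */
	"demo_thunk:\n"
	"	push %rbp\n"			/* what FRAME_BEGIN amounts to */
	"	mov %rsp, %rbp\n"
	"	call demo_work\n"		/* non-leaf: unwinders can now walk past it */
	"	pop %rbp\n"			/* what FRAME_END amounts to */
	"	ret\n"
	"	.size demo_thunk, .-demo_thunk\n"
	);

	extern long demo_thunk(long x);

	int main(void)
	{
		printf("%ld\n", demo_thunk(21));	/* prints 42 */
		return 0;
	}

The explicit .type/.size annotations are what let tools like stacktool treat the symbol as a callable function in the first place.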
Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/4373e5bff459b9fd66ce5d45bfcc881a5c202643.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/thunk_64.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index efb2b932b748..98df1fa8825c 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -8,11 +8,14 @@ #include #include "calling.h" #include +#include /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name + .type \name, @function \name: + FRAME_BEGIN /* this one pushes 9 elems, the next one would be %rIP */ pushq %rdi @@ -62,6 +65,7 @@ restore: popq %rdx popq %rsi popq %rdi + FRAME_END ret _ASM_NOKPROBE(restore) #endif -- cgit From 13523309495cdbd57a0d344c0d5d574987af007f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:21 -0600 Subject: x86/asm/acpi: Create a stack frame in do_suspend_lowlevel() do_suspend_lowlevel() is a callable non-leaf function which doesn't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Create a stack frame for it when CONFIG_FRAME_POINTER is enabled. Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Len Brown Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/7383d87dd40a460e0d757a0793498b9d06a7ee0d.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/wakeup_64.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8c35df468104..169963f471bb 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -5,6 +5,7 @@ #include #include #include +#include # Copyright 2003 Pavel Machek , distribute under GPLv2 @@ -39,6 +40,7 @@ bogus_64_magic: jmp bogus_64_magic ENTRY(do_suspend_lowlevel) + FRAME_BEGIN subq $8, %rsp xorl %eax, %eax call save_processor_state @@ -109,6 +111,7 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax addq $8, %rsp + FRAME_END jmp restore_processor_state ENDPROC(do_suspend_lowlevel) -- cgit From 3387a535ce629906d849864ef6a3c3437a645cb5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:22 -0600 Subject: x86/asm: Create stack frames in rwsem functions rwsem.S has several callable non-leaf functions which don't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Create stack frames for them when CONFIG_FRAME_POINTER is enabled. 
Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/ad0932bbead975b15f9578e4f2cf2ee5961eb840.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/lib/rwsem.S | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 40027db99140..be110efa0096 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -15,6 +15,7 @@ #include #include +#include #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -84,24 +85,29 @@ /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) + FRAME_BEGIN save_common_regs __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) + FRAME_BEGIN save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) + FRAME_BEGIN /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -109,15 +115,18 @@ ENTRY(call_rwsem_wake) movq %rax,%rdi call rwsem_wake restore_common_regs -1: ret +1: FRAME_END + ret ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) + FRAME_BEGIN save_common_regs __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_downgrade_wake) -- cgit From 779c433b8ea5c9fdfb892265b2ca6213d1f12ff8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:23 -0600 Subject: x86/asm/efi: Create a stack frame in efi_call() efi_call() is a callable non-leaf function which doesn't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Create a stack frame for it when CONFIG_FRAME_POINTER is enabled. Signed-off-by: Josh Poimboeuf Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/2294b6fad60eea4cc862eddc8e98a1324e6eeeca.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_stub_64.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 86d0f9e08dd9..0df2dcc18404 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -11,6 +11,7 @@ #include #include #include +#include #define SAVE_XMM \ mov %rsp, %rax; \ @@ -74,6 +75,7 @@ .endm ENTRY(efi_call) + FRAME_BEGIN SAVE_XMM mov (%rsp), %rax mov 8(%rax), %rax @@ -88,6 +90,7 @@ ENTRY(efi_call) RESTORE_PGT addq $48, %rsp RESTORE_XMM + FRAME_END ret ENDPROC(efi_call) -- cgit From ef0f3ed5a4acfb24740480bf2e50b178724f094d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:24 -0600 Subject: x86/asm/power: Create stack frames in hibernate_asm_64.S swsusp_arch_suspend() and restore_registers() are callable non-leaf functions which don't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Also they aren't annotated as ELF callable functions which can confuse tooling. Create a stack frame for them when CONFIG_FRAME_POINTER is enabled and give them proper ELF function annotations. Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/bdad00205897dc707aebe9e9e39757085e2bf999.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/power/hibernate_asm_64.S | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index e2386cb4e0c3..4400a43b9e28 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -21,8 +21,10 @@ #include #include #include +#include ENTRY(swsusp_arch_suspend) + FRAME_BEGIN movq $saved_context, %rax movq %rsp, pt_regs_sp(%rax) movq %rbp, pt_regs_bp(%rax) @@ -50,7 +52,9 @@ ENTRY(swsusp_arch_suspend) movq %rax, restore_cr3(%rip) call swsusp_save + FRAME_END ret +ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) /* switch to temporary page tables */ @@ -107,6 +111,7 @@ ENTRY(core_restore_code) */ ENTRY(restore_registers) + FRAME_BEGIN /* go back to the original page tables */ movq %rbx, %cr3 @@ -147,4 +152,6 @@ ENTRY(restore_registers) /* tell the hibernation core that we've just restored the memory */ movq %rax, in_suspend(%rip) + FRAME_END ret +ENDPROC(restore_registers) -- cgit From f05058c4d652b619adfda6c78d8f5b341169c264 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Thu, 21 Jan 2016 16:49:25 -0600 Subject: x86/uaccess: Add stack frame output operand in get_user() inline asm Numerous 'call without frame pointer save/setup' warnings are introduced by stacktool because of functions using the get_user() macro. Bad stack traces could occur due to lack of or misplacement of stack frame setup code. 
This patch forces a stack frame to be created before the inline asm code if CONFIG_FRAME_POINTER is enabled by listing the stack pointer as an output operand for the get_user() inline assembly statement. Signed-off-by: Chris J Arges Signed-off-by: Josh Poimboeuf Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/bc85501f221ee512670797c7f110022e64b12c81.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a4a30e4b2d34..9bbb3b2d0372 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -179,10 +179,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ + register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ - asm volatile("call __get_user_%P3" \ - : "=a" (__ret_gu), "=r" (__val_gu) \ + asm volatile("call __get_user_%P4" \ + : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ -- cgit From 2d8fe90a1b96d52c2a3f719c385b846b02f0bcd8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:26 -0600 Subject: x86/asm/bpf: Annotate callable functions bpf_jit.S has several functions which can be called from C code. Give them proper ELF annotations. Signed-off-by: Josh Poimboeuf Acked-by: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/bbe1de0c299fecd4fc9a1766bae8be2647bedb01.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/net/bpf_jit.S | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 4093216b3791..eb4a3bd293eb 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -22,15 +22,16 @@ 32 /* space for rbx,r13,r14,r15 */ + \ 8 /* space for skb_copy_bits */) -sk_load_word: - .globl sk_load_word +#define FUNC(name) \ + .globl name; \ + .type name, @function; \ + name: +FUNC(sk_load_word) test %esi,%esi js bpf_slow_path_word_neg -sk_load_word_positive_offset: - .globl sk_load_word_positive_offset - +FUNC(sk_load_word_positive_offset) mov %r9d,%eax # hlen sub %esi,%eax # hlen - offset cmp $3,%eax @@ -39,15 +40,11 @@ sk_load_word_positive_offset: bswap %eax /* ntohl() */ ret -sk_load_half: - .globl sk_load_half - +FUNC(sk_load_half) test %esi,%esi js bpf_slow_path_half_neg -sk_load_half_positive_offset: - .globl sk_load_half_positive_offset - +FUNC(sk_load_half_positive_offset) mov %r9d,%eax sub %esi,%eax # hlen - offset cmp $1,%eax @@ -56,15 +53,11 @@ sk_load_half_positive_offset: rol $8,%ax # ntohs() ret -sk_load_byte: - .globl sk_load_byte - +FUNC(sk_load_byte) test %esi,%esi js bpf_slow_path_byte_neg -sk_load_byte_positive_offset: - .globl sk_load_byte_positive_offset - +FUNC(sk_load_byte_positive_offset) cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ jle bpf_slow_path_byte movzbl (SKBDATA,%rsi),%eax @@ -120,8 +113,8 @@ bpf_slow_path_byte: bpf_slow_path_word_neg: cmp SKF_MAX_NEG_OFF, %esi /* test range */ jl bpf_error /* offset lower -> error */ -sk_load_word_negative_offset: - .globl sk_load_word_negative_offset + +FUNC(sk_load_word_negative_offset) sk_negative_common(4) mov (%rax), %eax bswap %eax @@ -130,8 +123,8 @@ sk_load_word_negative_offset: bpf_slow_path_half_neg: cmp SKF_MAX_NEG_OFF, %esi jl bpf_error -sk_load_half_negative_offset: - .globl sk_load_half_negative_offset + +FUNC(sk_load_half_negative_offset) sk_negative_common(2) mov (%rax),%ax rol $8,%ax @@ -141,8 +134,8 @@ sk_load_half_negative_offset: bpf_slow_path_byte_neg: cmp SKF_MAX_NEG_OFF, %esi jl bpf_error -sk_load_byte_negative_offset: - .globl sk_load_byte_negative_offset + +FUNC(sk_load_byte_negative_offset) sk_negative_common(1) movzbl (%rax), %eax ret -- cgit From d21001cc15ba9f63b0334d60942278587471a451 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:27 -0600 Subject: x86/asm/bpf: Create stack frames in bpf_jit.S bpf_jit.S has several callable non-leaf functions which don't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Create a stack frame before the call instructions when CONFIG_FRAME_POINTER is enabled. Signed-off-by: Josh Poimboeuf Acked-by: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/fa4c41976b438b51954cb8021f06bceb1d1d66cc.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/net/bpf_jit.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index eb4a3bd293eb..f2a7faf4706e 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -8,6 +8,7 @@ * of the License. */ #include +#include /* * Calling convention : @@ -65,16 +66,18 @@ FUNC(sk_load_byte_positive_offset) /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ + lea -MAX_BPF_STACK + 32(%rbp), %rdx;\ + FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ mov $LEN,%ecx; /* len */ \ - lea - MAX_BPF_STACK + 32(%rbp),%rdx; \ call skb_copy_bits; \ test %eax,%eax; \ pop SKBDATA; \ - pop %r9; + pop %r9; \ + FRAME_END bpf_slow_path_word: @@ -99,6 +102,7 @@ bpf_slow_path_byte: ret #define sk_negative_common(SIZE) \ + FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ push SKBDATA; \ @@ -108,6 +112,7 @@ bpf_slow_path_byte: test %rax,%rax; \ pop SKBDATA; \ pop %r9; \ + FRAME_END; \ jz bpf_error bpf_slow_path_word_neg: -- cgit From c1c355ce14c037666fbcb9453d9067c86bbdda5c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:28 -0600 Subject: x86/kprobes: Get rid of kretprobe_trampoline_holder() The kretprobe_trampoline_holder() wrapper around kretprobe_trampoline() isn't used anywhere and adds some unnecessary frame pointer instructions which never execute. Instead, just make kretprobe_trampoline() a proper ELF function. Signed-off-by: Josh Poimboeuf Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Anil S Keshavamurthy Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: David S. Miller Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/92d921b102fb865a7c254cfde9e4a0a72b9a781e.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 57 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6cc873..5b187dfbfc8b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -671,38 +671,37 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. */ -static void __used kretprobe_trampoline_holder(void) -{ - asm volatile ( - ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" +asm( + ".global kretprobe_trampoline\n" + ".type kretprobe_trampoline, @function\n" + "kretprobe_trampoline:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - /* Replace saved sp with true return address. 
*/ - " movq %rax, 152(%rsp)\n" - RESTORE_REGS_STRING - " popfq\n" + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + RESTORE_REGS_STRING + " popfq\n" #else - " pushf\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* Move flags to cs */ - " movl 56(%esp), %edx\n" - " movl %edx, 52(%esp)\n" - /* Replace saved flags with true return address. */ - " movl %eax, 56(%esp)\n" - RESTORE_REGS_STRING - " popf\n" + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 56(%esp)\n" + RESTORE_REGS_STRING + " popf\n" #endif - " ret\n"); -} -NOKPROBE_SYMBOL(kretprobe_trampoline_holder); + " ret\n" + ".size kretprobe_trampoline, .-kretprobe_trampoline\n" +); NOKPROBE_SYMBOL(kretprobe_trampoline); /* -- cgit From 1482a0825bdf82dab4074bd3c824f4c87cbdf848 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:29 -0600 Subject: x86/kvm: Set ELF function type for fastop functions The callable functions created with the FOP* and FASTOP* macros are missing ELF function annotations, which confuses tools like stacktool. Properly annotate them. This adds some additional labels to the assembly, but the generated binary code is unchanged (with the exception of instructions which have embedded references to __LINE__). Signed-off-by: Josh Poimboeuf Acked-by: Paolo Bonzini Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: Gleb Natapov Cc: H. 
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/e399651c89ace54906c203c0557f66ed6ea3ce8d.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kvm/emulate.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1505587d06e9..aa4d72667878 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -309,23 +309,29 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); -#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" +#define FOP_FUNC(name) \ + ".align " __stringify(FASTOP_SIZE) " \n\t" \ + ".type " name ", @function \n\t" \ + name ":\n\t" + #define FOP_RET "ret \n\t" #define FOP_START(op) \ extern void em_##op(struct fastop *fake); \ asm(".pushsection .text, \"ax\" \n\t" \ ".global em_" #op " \n\t" \ - FOP_ALIGN \ - "em_" #op ": \n\t" + FOP_FUNC("em_" #op) #define FOP_END \ ".popsection") -#define FOPNOP() FOP_ALIGN FOP_RET +#define FOPNOP() \ + FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \ + FOP_RET #define FOP1E(op, dst) \ - FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst) \ + "10: " #op " %" #dst " \n\t" FOP_RET #define FOP1EEX(op, dst) \ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) @@ -357,7 +363,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP2E(op, dst, src) \ - FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src) \ + #op " %" #src ", %" #dst " \n\t" FOP_RET #define FASTOP2(op) \ FOP_START(op) \ @@ -395,7 +402,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP3E(op, dst, src, src2) \ - FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ + #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET /* 3-operand, word-only, src2=cl */ #define FASTOP3WCL(op) \ @@ -407,7 +415,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END /* Special case for SETcc - 1 instruction per cc */ -#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +#define FOP_SETCC(op) \ + ".align 4 \n\t" \ + ".type " #op ", @function \n\t" \ + #op ": \n\t" \ + #op " %al \n\t" \ + FOP_RET asm(".global kvm_fastop_exception \n" "kvm_fastop_exception: xor %esi, %esi; ret"); -- cgit From cb7390fed4c04e609a420ac0b1c07a7a781b43bf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 22 Jan 2016 10:16:12 -0600 Subject: x86/kvm: Make test_cc() always inline With some configs (including allyesconfig), gcc doesn't inline test_cc(). When that happens, test_cc() doesn't create a stack frame before inserting the inline asm call instruction. This breaks frame pointer convention if CONFIG_FRAME_POINTER is enabled and can result in a bad stack trace. Force it to always be inlined so that its containing function's stack frame can be used. Suggested-by: Paolo Bonzini Signed-off-by: Josh Poimboeuf Acked-by: Paolo Bonzini Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: Gleb Natapov Cc: H. 
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/20160122161612.GE20502@treble.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aa4d72667878..80363ebfd0a6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -969,7 +969,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) return fastop(ctxt, em_bsr); } -static u8 test_cc(unsigned int condition, unsigned long flags) +static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); -- cgit From 16df4ff8604881db0130f93f4b6ade759fa48e87 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 21 Jan 2016 16:49:32 -0600 Subject: x86/locking: Create stack frame in PV unlock The assembly PV_UNLOCK function is a callable non-leaf function which doesn't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Create a stack frame when CONFIG_FRAME_POINTER is enabled. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/6685a72ddbbd0ad3694337cca0af4b4ea09f5f40.1453405861.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/qspinlock_paravirt.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 9f92c180ed2f..9d55f9b6e167 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -36,8 +36,10 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); */ asm (".pushsection .text;" ".globl " PV_UNLOCK ";" + ".type " PV_UNLOCK ", @function;" ".align 4,0x90;" PV_UNLOCK ": " + FRAME_BEGIN "push %rdx;" "mov $0x1,%eax;" "xor %edx,%edx;" @@ -45,6 +47,7 @@ asm (".pushsection .text;" "cmp $0x1,%al;" "jne .slowpath;" "pop %rdx;" + FRAME_END "ret;" ".slowpath: " "push %rsi;" @@ -52,6 +55,7 @@ asm (".pushsection .text;" "call " PV_UNLOCK_SLOWPATH ";" "pop %rsi;" "pop %rdx;" + FRAME_END "ret;" ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" ".popsection"); -- cgit From 3f62de5f6f369b67b7ac709e3c942c9130d2c51a Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Fri, 22 Jan 2016 15:44:38 -0600 Subject: x86/kvm: Add output operand in vmx_handle_external_intr inline asm Stacktool generates the following warning: stacktool: arch/x86/kvm/vmx.o: vmx_handle_external_intr()+0x67: call without frame pointer save/setup By adding the stackpointer as an output operand, this patch ensures that a stack frame is created when CONFIG_FRAME_POINTER is enabled for the inline assmebly statement. 
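An illustrative aside (not from any of the patches above): the stack-pointer output-operand idiom used by this change, and by the get_user() and __preempt_schedule() changes in the same series, can be reproduced in a stand-alone C file. Everything below is a hypothetical sketch: the callee name, function name, and user-space compiler flags are made up; only the "+r" constraint on a register variable bound to the stack pointer is taken from the patches.

	/*
	 * Hypothetical stand-alone sketch (not kernel code).  Build with
	 * something like "gcc -O2 -fno-omit-frame-pointer" on x86-64 to
	 * mimic CONFIG_FRAME_POINTER=y.
	 */

	/* dummy callee so the example links; the name is made up */
	asm(".globl example_asm_callee\n"
	    "example_asm_callee:\n"
	    "\tret\n");

	static __attribute__((noinline)) void call_with_frame_dependency(void)
	{
		/* bind a local register variable to the stack pointer ... */
		register void *sp asm("rsp");

		/*
		 * ... and list it as an in/out operand.  The compiler now
		 * treats the asm as depending on and modifying the stack
		 * pointer, so it emits the push %rbp / mov %rsp,%rbp
		 * prologue before the call instead of omitting the frame.
		 * A real kernel user would also have to declare whatever
		 * registers the callee actually clobbers; this dummy
		 * callee clobbers nothing.
		 */
		asm volatile("call example_asm_callee" : "+r" (sp) : : "memory");
	}

	int main(void)
	{
		call_with_frame_dependency();
		return 0;
	}

Inspecting the generated assembly with and without the "+r"(sp) operand shows the difference: without it the compiler may legally emit the call with no frame at all, and with it the frame is guaranteed on the path containing the call (or, as in the cyc2ns_read_end() comparison further down, the frame setup may simply be sunk into the branch that performs the call).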
Signed-off-by: Chris J Arges Reviewed-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gleb@kernel.org Cc: kvm@vger.kernel.org Cc: live-patching@vger.kernel.org Cc: pbonzini@redhat.com Link: http://lkml.kernel.org/r/1453499078-9330-3-git-send-email-chris.j.arges@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/kvm/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2951b6edbbc..e1535225bc1d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8356,6 +8356,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + register void *__sp asm(_ASM_SP); /* * If external interrupt exists, IF bit is set in rflags/eflags on the @@ -8388,8 +8389,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "call *%[entry]\n\t" : #ifdef CONFIG_X86_64 - [sp]"=&r"(tmp) + [sp]"=&r"(tmp), #endif + "+r"(__sp) : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), -- cgit From 821eae7d14f0bbf69df1cc4656c54900b2672928 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Feb 2016 11:41:58 -0600 Subject: sched/x86: Add stack frame dependency to __preempt_schedule[_notrace]() If __preempt_schedule() or __preempt_schedule_notrace() is referenced at the beginning of a function, gcc can insert the asm inline "call ___preempt_schedule[_notrace]" instruction before setting up a stack frame, which breaks frame pointer convention if CONFIG_FRAME_POINTER is enabled and can result in bad stack traces. Force a stack frame to be created if CONFIG_FRAME_POINTER is enabled by listing the stack pointer as an output operand for the inline asm statements. Specifically this fixes the following stacktool warnings: stacktool: drivers/scsi/hpsa.o: hpsa_scsi_do_simple_cmd.constprop.106()+0x79: call without frame pointer save/setup stacktool: fs/mbcache.o: mb_cache_entry_find_first()+0x70: call without frame pointer save/setup stacktool: fs/mbcache.o: mb_cache_entry_find_first()+0x92: call without frame pointer save/setup stacktool: fs/mbcache.o: mb_cache_entry_free()+0xff: call without frame pointer save/setup stacktool: fs/mbcache.o: mb_cache_entry_free()+0xf5: call without frame pointer save/setup stacktool: fs/mbcache.o: mb_cache_entry_free()+0x11a: call without frame pointer save/setup stacktool: fs/mbcache.o: mb_cache_entry_get()+0x225: call without frame pointer save/setup stacktool: kernel/locking/percpu-rwsem.o: percpu_up_read()+0x27: call without frame pointer save/setup stacktool: kernel/profile.o: do_profile_hits.isra.5()+0x139: call without frame pointer save/setup stacktool: lib/nmi_backtrace.o: nmi_trigger_all_cpu_backtrace()+0x2b6: call without frame pointer save/setup stacktool: net/rds/ib_cm.o: rds_ib_cq_comp_handler_recv()+0x58: call without frame pointer save/setup stacktool: net/rds/ib_cm.o: rds_ib_cq_comp_handler_send()+0x58: call without frame pointer save/setup stacktool: net/rds/ib_recv.o: rds_ib_attempt_ack()+0xc1: call without frame pointer save/setup stacktool: net/rds/iw_recv.o: rds_iw_attempt_ack()+0xc1: call without frame pointer save/setup stacktool: net/rds/iw_recv.o: rds_iw_recv_cq_comp_handler()+0x55: call without frame pointer save/setup So it only adds a stack frame to 15 call sites out of ~5000 calls to ___preempt_schedule[_notrace](). All the others already had stack frames. 
Oddly, this change actually seems to make things faster in a lot of cases. For many smaller functions it causes the stack frame creation to get moved out of the common path and into the unlikely path. For example, here's the original cyc2ns_read_end(): ffffffff8101f8c0 : ffffffff8101f8c0: 55 push %rbp ffffffff8101f8c1: 48 89 e5 mov %rsp,%rbp ffffffff8101f8c4: 83 6f 10 01 subl $0x1,0x10(%rdi) ffffffff8101f8c8: 75 08 jne ffffffff8101f8d2 ffffffff8101f8ca: 65 48 89 3d e6 5a ff mov %rdi,%gs:0x7eff5ae6(%rip) # 153b8 ffffffff8101f8d1: 7e ffffffff8101f8d2: 65 ff 0d 77 c4 fe 7e decl %gs:0x7efec477(%rip) # bd50 <__preempt_count> ffffffff8101f8d9: 74 02 je ffffffff8101f8dd ffffffff8101f8db: 5d pop %rbp ffffffff8101f8dc: c3 retq ffffffff8101f8dd: e8 1e 37 fe ff callq ffffffff81003000 <___preempt_schedule> ffffffff8101f8e2: 5d pop %rbp ffffffff8101f8e3: c3 retq ffffffff8101f8e4: 66 66 66 2e 0f 1f 84 data16 data16 nopw %cs:0x0(%rax,%rax,1) ffffffff8101f8eb: 00 00 00 00 00 And here's the same function with the patch: ffffffff8101f8c0 : ffffffff8101f8c0: 83 6f 10 01 subl $0x1,0x10(%rdi) ffffffff8101f8c4: 75 08 jne ffffffff8101f8ce ffffffff8101f8c6: 65 48 89 3d ea 5a ff mov %rdi,%gs:0x7eff5aea(%rip) # 153b8 ffffffff8101f8cd: 7e ffffffff8101f8ce: 65 ff 0d 7b c4 fe 7e decl %gs:0x7efec47b(%rip) # bd50 <__preempt_count> ffffffff8101f8d5: 74 01 je ffffffff8101f8d8 ffffffff8101f8d7: c3 retq ffffffff8101f8d8: 55 push %rbp ffffffff8101f8d9: 48 89 e5 mov %rsp,%rbp ffffffff8101f8dc: e8 1f 37 fe ff callq ffffffff81003000 <___preempt_schedule> ffffffff8101f8e1: 5d pop %rbp ffffffff8101f8e2: c3 retq ffffffff8101f8e3: 66 66 66 66 2e 0f 1f data16 data16 data16 nopw %cs:0x0(%rax,%rax,1) ffffffff8101f8ea: 84 00 00 00 00 00 Notice that it moved the frame pointer setup code to the unlikely ___preempt_schedule() call path. Going through a sampling of the differences in the asm, that's the most common change I see. Otherwise it has no real effect on callers which already have stack frames (though it does result in the reordering of some 'mov's). 
Reported-by: Jiri Slaby Tested-by: Jiri Slaby Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/20160218174158.GA28230@treble.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 01bcde84d3e4..d397deb58146 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -94,10 +94,19 @@ static __always_inline bool should_resched(int preempt_offset) #ifdef CONFIG_PREEMPT extern asmlinkage void ___preempt_schedule(void); -# define __preempt_schedule() asm ("call ___preempt_schedule") +# define __preempt_schedule() \ +({ \ + register void *__sp asm(_ASM_SP); \ + asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \ +}) + extern asmlinkage void preempt_schedule(void); extern asmlinkage void ___preempt_schedule_notrace(void); -# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace") +# define __preempt_schedule_notrace() \ +({ \ + register void *__sp asm(_ASM_SP); \ + asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \ +}) extern asmlinkage void preempt_schedule_notrace(void); #endif -- cgit From 04d1d281dcfe683a53cddfab8371fc8bb302b069 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 23 Feb 2016 13:19:29 -0800 Subject: x86/entry/32: Add an ASM_CLAC to entry_SYSENTER_32 Both before and after 5f310f739b4c ("x86/entry/32: Re-implement SYSENTER using the new C path"), we relied on a uaccess very early in the SYSENTER path to clear AC. After that change, though, we can potentially make it all the way into C code with AC set, which enlarges the attack surface for SMAP bypass by doing SYSENTER with AC set. Strengthen the SMAP protection by addding the missing ASM_CLAC right at the beginning. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3e36be110724896e32a4a1fe73bacb349d3cba94.1456262295.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 77d8c5112900..bb3e376d0f33 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,6 +294,7 @@ sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ + ASM_CLAC /* Clear AC after saving FLAGS */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ -- cgit From 6a6256f9e0ebaabf7ded1fef8977a4352dbe7784 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Tue, 23 Feb 2016 15:34:30 -0800 Subject: x86: Fix misspellings in comments Signed-off-by: Adam Buchbinder Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 2 +- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/include/asm/rwsem.h | 2 +- arch/x86/kernel/acpi/boot.c | 4 ++-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 +- arch/x86/kernel/early_printk.c | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/kexec-bzimage64.c | 2 +- arch/x86/kernel/kgdb.c | 4 ++-- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/tsc.c | 2 +- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/lib/memset_64.S | 2 +- arch/x86/mm/mpx.c | 4 ++-- arch/x86/mm/pat.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_bma023.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_emc1403.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_lis331.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_max7315.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c | 2 +- arch/x86/platform/intel-mid/device_libs/platform_tca6416.c | 2 +- arch/x86/purgatory/stack.S | 2 +- arch/x86/xen/mmu.c | 8 ++++---- arch/x86/xen/xen-head.S | 2 +- 43 files changed, 52 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 1a50e09c945b..03c3eb77bfce 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -178,7 +178,7 @@ notrace static cycle_t vread_tsc(void) /* * GCC likes to generate cmov here, but this branch is extremely - * 
predictable (it's just a funciton of time and the likely is + * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 24938852db30..abbad505dd24 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -52,7 +52,7 @@ int ftrace_int3_handler(struct pt_regs *regs); * this screws up the trace output when tracing a ia32 task. * Instead of reporting bogus syscalls, just do not trace them. * - * If the user realy wants these, then they should use the + * If the user really wants these, then they should use the * raw syscall tracepoints with filtering. */ #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index cad82c9c2fde..ceec86eb68e9 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -25,7 +25,7 @@ * This should be totally fair - if anything is waiting, a process that wants a * lock will go to the back of the queue. When the currently active lock is * released, if there's a writer at the front of the queue, then that and only - * that will be woken up; if there's a bunch of consequtive readers at the + * that will be woken up; if there's a bunch of consecutive readers at the * front, then they'll all be woken up, but no other readers will be. */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e75907601a41..8c2f1ef6ca23 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -956,7 +956,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) /* * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, @@ -984,7 +984,7 @@ static int __init acpi_parse_madt_lapic_entries(void) /* * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8a5cddac7d44..3b892bbdd902 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1611,7 +1611,7 @@ void __init enable_IR_x2apic(void) legacy_pic->mask_all(); mask_ioapic_entries(); - /* If irq_remapping_prepare() succeded, try to enable it */ + /* If irq_remapping_prepare() succeeded, try to enable it */ if (ir_stat >= 0) ir_stat = try_to_enable_IR(); /* ir_stat contains the remap mode or an error code */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 624db00583f4..278070381aee 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -860,7 +860,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, */ void uv_cpu_init(void) { - /* CPU 0 initilization will be done via uv_system_init. */ + /* CPU 0 initialization will be done via uv_system_init. 
*/ if (!uv_blade_info) return; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 052c9c3026cc..9307f182fe30 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1088,7 +1088,7 @@ static int apm_get_battery_status(u_short which, u_short *status, * @device: identity of device * @enable: on/off * - * Activate or deactive power management on either a specific device + * Activate or deactivate power management on either a specific device * or the entire system (%APM_DEVICE_ALL). */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 97c59fd60702..b39338c4b260 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -82,7 +82,7 @@ static void init_amd_k5(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 /* * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux + * of the Elan at 0x000df000. Unfortunately, one of the Linux * drivers subsequently pokes it, and changes the CPU speed. * Workaround : Remove the unneeded alias. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68a80e9b67fc..7c3120f5177b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -884,7 +884,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); - /* Clear/Set all flags overriden by options, after probe */ + /* Clear/Set all flags overridden by options, after probe */ for (i = 0; i < NCAPINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; @@ -943,7 +943,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) x86_init_cache_qos(c); /* - * Clear/Set all flags overriden by options, need do it + * Clear/Set all flags overridden by options, need do it * before following smp all cpus cap AND. */ for (i = 0; i < NCAPINTS; i++) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fcbcb2f678ca..19f57360dfd2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -42,7 +42,7 @@ EXPORT_SYMBOL_GPL(mtrr_state); * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set - * to 1 during BIOS initalization of the fixed MTRRs, then cleared to + * to 1 during BIOS initialization of the fixed MTRRs, then cleared to * 0 for operation." */ static inline void k8_check_syscfg_dram_mod_en(void) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7bb61e32fb29..3ea127f05d0d 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -272,7 +272,7 @@ struct cpu_hw_events { * events to select for counter rescheduling. * * Care must be taken as the rescheduling algorithm is O(n!) which - * will increase scheduling cycles for an over-commited system + * will increase scheduling cycles for an over-committed system * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros * and its counter masks must be kept at a minimum. 
*/ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 653f88d25987..288f22ddc6d8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -649,7 +649,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * return the type of control flow change at address "from" - * intruction is not necessarily a branch (in case of interrupt). + * instruction is not necessarily a branch (in case of interrupt). * * The branch type returned also includes the priv level of the * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 21bf92490a7b..8a121991e5ba 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -287,7 +287,7 @@ static __init void early_pci_serial_init(char *s) } /* - * Lastly, initalize the hardware + * Lastly, initialize the hardware */ if (*s) { if (strcmp(s, "nocfg") == 0) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 0bc3490420c5..8bd1c003942a 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,7 +8,7 @@ /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. + * capabilities supported by the xsave. */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 0f8a6bbaaa44..2af478e3fd4e 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -271,7 +271,7 @@ static int bzImage64_probe(const char *buf, unsigned long len) int ret = -ENOEXEC; struct setup_header *header; - /* kernel should be atleast two sectors long */ + /* kernel should be at least two sectors long */ if (len < 2 * 512) { pr_err("File is too short to be a bzImage\n"); return ret; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..631a7087e332 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -609,9 +609,9 @@ static struct notifier_block kgdb_notifier = { }; /** - * kgdb_arch_init - Perform any architecture specific initalization. + * kgdb_arch_init - Perform any architecture specific initialization. * - * This function will handle the initalization of any architecture + * This function will handle the initialization of any architecture * specific callbacks. */ int kgdb_arch_init(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 72cef58693c7..1d39bfbd26bb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -226,7 +226,7 @@ static void kvm_setup_secondary_clock(void) * registered memory location. If the guest happens to shutdown, this memory * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. 
So before any - * kind of shutdown from our side, we unregister the clock by writting anything + * kind of shutdown from our side, we unregister the clock by writing anything * that does not have the 'enable' bit set in the msr */ #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..acec49b302d1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -876,7 +876,7 @@ void tsc_restore_sched_clock_state(void) local_irq_save(flags); /* - * We're comming out of suspend, there's no concurrency yet; don't + * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95a955de5964..e1bb320dd5b2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -478,7 +478,7 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { /* - * Always atomicly update spte if it can be updated + * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. @@ -549,7 +549,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) /* * For the spte updated out of mmu-lock is safe, since - * we always atomicly update it, see the comments in + * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ if (spte_is_locklessly_modifiable(old_spte) && diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2951b6edbbc..34208bf57c35 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5475,7 +5475,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as approriate for clts instruction exit. */ +/* called to set cr0 as appropriate for clts instruction exit. */ static void handle_clts(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { @@ -7223,7 +7223,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the approriate number of + * bit (field_value), and then copies only the appropriate number of * bits into the vmcs12 field. */ u64 field_value = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4244c2baf57d..429c3f5fc618 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1562,7 +1562,7 @@ static cycle_t read_tsc(void) /* * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is + * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2661fad05827..f9962ad4e9c4 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -9,7 +9,7 @@ /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is - * simpler and shorter than the orignal function as well. + * simpler and shorter than the original function as well. 
* * rdi destination * rsi value (char) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b2fd67da1701..cca5174f86fe 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -728,14 +728,14 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) /* * This covers 32-bit emulation as well as 32-bit kernels - * running on 64-bit harware. + * running on 64-bit hardware. */ if (!is_64bit_mm(mm)) return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; /* * 'x86_virt_bits' returns what the hardware is capable - * of, and returns the full >32-bit adddress space when + * of, and returns the full >32-bit address space when * running 32-bit kernels on 64-bit hardware. */ virt_space = (1ULL << boot_cpu_data.x86_virt_bits); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536b0914..fa14b04b9f64 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -149,7 +149,7 @@ enum { PAT_WT = 4, /* Write Through */ PAT_WP = 5, /* Write Protected */ PAT_WB = 6, /* Write Back (default) */ - PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ }; #define CM(c) (_PAGE_CACHE_MODE_ ## c) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c index 0ae7f2ae2296..c26cf393d35a 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c @@ -1,5 +1,5 @@ /* - * platform_bma023.c: bma023 platform data initilization file + * platform_bma023.c: bma023 platform data initialization file * * (C) Copyright 2013 Intel Corporation * diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c index 69a783689d21..c259fb6c8f4f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c @@ -1,5 +1,5 @@ /* - * platform_emc1403.c: emc1403 platform data initilization file + * platform_emc1403.c: emc1403 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index dccae6b0413f..52534ec29765 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -1,5 +1,5 @@ /* - * platform_gpio_keys.c: gpio_keys platform data initilization file + * platform_gpio_keys.c: gpio_keys platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c index 54226de7541a..a35cf912de43 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c @@ -1,5 +1,5 @@ /* - * platform_lis331.c: lis331 platform data initilization file + * platform_lis331.c: lis331 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c index 2c8acbc1e9ad..6e075afa7877 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c +++ 
b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c @@ -1,5 +1,5 @@ /* - * platform_max7315.c: max7315 platform data initilization file + * platform_max7315.c: max7315 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c index cfe9a47a1e87..ee22864bbc2f 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c @@ -1,5 +1,5 @@ /* - * platform_mpu3050.c: mpu3050 platform data initilization file + * platform_mpu3050.c: mpu3050 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c index 9f4a775a69d6..e421106c11cf 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.c @@ -1,5 +1,5 @@ /* - * platform_msic.c: MSIC platform data initilization file + * platform_msic.c: MSIC platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c index 29629397d2b3..cb3490ecb341 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c @@ -1,5 +1,5 @@ /* - * platform_msic_audio.c: MSIC audio platform data initilization file + * platform_msic_audio.c: MSIC audio platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c index f446c33df1a8..4f72193939a6 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c @@ -1,5 +1,5 @@ /* - * platform_msic_battery.c: MSIC battery platform data initilization file + * platform_msic_battery.c: MSIC battery platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c index 2a4f7b1dd917..70de5b531ba0 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c @@ -1,5 +1,5 @@ /* - * platform_msic_gpio.c: MSIC GPIO platform data initilization file + * platform_msic_gpio.c: MSIC GPIO platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c index 6497111ddb54..3d7c2011b6cf 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c @@ -1,5 +1,5 @@ /* - * platform_msic_ocd.c: MSIC OCD platform data initilization file + * platform_msic_ocd.c: MSIC OCD platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git 
a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c index 83a3459bc337..038f618fbc52 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c @@ -1,5 +1,5 @@ /* - * platform_msic_power_btn.c: MSIC power btn platform data initilization file + * platform_msic_power_btn.c: MSIC power btn platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c index a351878b96bc..114a5755b1e4 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c @@ -1,5 +1,5 @@ /* - * platform_msic_thermal.c: msic_thermal platform data initilization file + * platform_msic_thermal.c: msic_thermal platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c index 65c2a9a19db4..e30cb62e3300 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c @@ -1,5 +1,5 @@ /* - * platform_pmic_gpio.c: PMIC GPIO platform data initilization file + * platform_pmic_gpio.c: PMIC GPIO platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c index 740fc757050c..b1526b95fd43 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c @@ -1,5 +1,5 @@ /* - * platform_tc35876x.c: tc35876x platform data initilization file + * platform_tc35876x.c: tc35876x platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c index 33be0b3be6e1..4f41372ce400 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c @@ -1,5 +1,5 @@ /* - * platform_tca6416.c: tca6416 platform data initilization file + * platform_tca6416.c: tca6416 platform data initialization file * * (C) Copyright 2013 Intel Corporation * Author: Sathyanarayanan Kuppuswamy diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S index 3cefba1fefc8..50a4147f91fb 100644 --- a/arch/x86/purgatory/stack.S +++ b/arch/x86/purgatory/stack.S @@ -8,7 +8,7 @@ */ /* A stack for the loaded kernel. - * Seperate and in the data section so it can be prepopulated. + * Separate and in the data section so it can be prepopulated. 
*/ .data .balign 4096 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c913ca4f6958..478a2de543a5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1256,7 +1256,7 @@ static void __init xen_pagetable_cleanhighmap(void) xen_cleanhighmap(addr, addr + size); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); #ifdef DEBUG - /* This is superflous and is not neccessary, but you know what + /* This is superfluous and is not necessary, but you know what * lets do it. The MODULES_VADDR -> MODULES_END should be clear of * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); @@ -1474,7 +1474,7 @@ static void xen_write_cr3(unsigned long cr3) /* * At the start of the day - when Xen launches a guest, it has already * built pagetables for the guest. We diligently look over them - * in xen_setup_kernel_pagetable and graft as appropiate them in the + * in xen_setup_kernel_pagetable and graft as appropriate them in the * init_level4_pgt and its friends. Then when we are happy we load * the new init_level4_pgt - and continue on. * @@ -2792,7 +2792,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, struct remap_data *rmd = data; pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot)); - /* If we have a contigious range, just update the mfn itself, + /* If we have a contiguous range, just update the mfn itself, else update pointer to be "next mfn". */ if (rmd->contiguous) (*rmd->mfn)++; @@ -2833,7 +2833,7 @@ static int do_remap_gfn(struct vm_area_struct *vma, rmd.mfn = gfn; rmd.prot = prot; - /* We use the err_ptr to indicate if there we are doing a contigious + /* We use the err_ptr to indicate if there we are doing a contiguous * mapping or a discontigious mapping. */ rmd.contiguous = !err_ptr; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index b65f59a358a2..dc6457017dec 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -26,7 +26,7 @@ (1 << XENFEAT_auto_translated_physmap) | \ (1 << XENFEAT_supervisor_mode_kernel) | \ (1 << XENFEAT_hvm_callback_vector)) -/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that +/* The XENFEAT_writable_page_tables is not stricly necessary as we set that * up regardless whether this CONFIG option is enabled or not, but it * clarifies what the right flags need to be. */ -- cgit From 172b2386ed16a9143d9a456aae5ec87275c61489 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 Feb 2016 17:50:23 +0100 Subject: KVM: x86: fix missed hardware breakpoints Sometimes when setting a breakpoint a process doesn't stop on it. This is because the debug registers are not loaded correctly on VCPU load. The following simple reproducer from Oleg Nesterov tries using debug registers in two threads. To see the bug, run a 2-VCPU guest with "taskset -c 0" and run "./bp 0 1" inside the guest. 
#include #include #include #include #include #include #include #include #include #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void set_bp(pid_t pid, void *addr) { unsigned long dr7; assert(write_dr(pid, 0, (long)addr) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); } void *get_rip(int pid) { return (void*)ptrace(PTRACE_PEEKUSER, pid, offsetof(struct user, regs.rip), 0); } void test(int nr) { void *bp_addr = &&label + nr, *bp_hit; int pid; printf("test bp %d\n", nr); assert(nr < 16); // see 16 asm nops below pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGSTOP); for (;;) { label: asm ( "nop; nop; nop; nop;" "nop; nop; nop; nop;" "nop; nop; nop; nop;" "nop; nop; nop; nop;" ); } } assert(pid == wait(NULL)); set_bp(pid, bp_addr); for (;;) { assert(ptrace(PTRACE_CONT, pid, 0, 0) == 0); assert(pid == wait(NULL)); bp_hit = get_rip(pid); if (bp_hit != bp_addr) fprintf(stderr, "ERR!! hit wrong bp %ld != %d\n", bp_hit - &&label, nr); } } int main(int argc, const char *argv[]) { while (--argc) { int nr = atoi(*++argv); if (!fork()) test(nr); } while (wait(NULL) > 0) ; return 0; } Cc: stable@vger.kernel.org Suggested-by: Nadav Amit Reported-by: Andrey Wagin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4244c2baf57d..f4891f2ece23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2752,6 +2752,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -- cgit From 0c1d77f4ba5cc9c05a29adca3d6466cdf4969b70 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 19 Feb 2016 18:07:21 +0100 Subject: KVM: x86: fix conversion of addresses to linear in 32-bit protected mode Commit e8dd2d2d641c ("Silence compiler warning in arch/x86/kvm/emulate.c", 2015-09-06) broke boot of the Hurd. The bug is that the "default:" case actually could modify "la", but after the patch this change is not reflected in *linear. The bug is visible whenever a non-zero segment base causes the linear address to wrap around the 4GB mark. 
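An illustrative aside (not from the patch): a worked example of the wrap-around the fix restores, using made-up segment values. Outside 64-bit mode the linear address is the 32-bit truncation of segment base plus effective address, and after the earlier cleanup the "default:" case stopped propagating that truncation into *linear because *linear was assigned before the switch.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* hypothetical numbers chosen so base + ea crosses 4GB */
		uint64_t base = 0xf0000000ULL;   /* non-zero segment base      */
		uint64_t ea   = 0x20000000ULL;   /* effective address          */
		uint64_t la   = base + ea;       /* 0x110000000: 33-bit result */

		/* what the emulator must report outside 64-bit mode */
		uint32_t linear = (uint32_t)la;  /* 0x10000000 after the wrap  */

		printf("la=%#llx linear=%#x\n", (unsigned long long)la, linear);
		return 0;
	}

Before the fix, the non-64-bit cases left *linear holding the untruncated value (0x110000000 in this sketch), so any access whose segment base pushed the sum past the 4GB mark was emulated at the wrong linear address.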
Fixes: e8dd2d2d641cb2724ee10e76c0ad02e04289c017 Cc: stable@vger.kernel.org Reported-by: Aurelien Jarno Tested-by: Aurelien Jarno Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1505587d06e9..b9b09fec173b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -650,10 +650,10 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, u16 sel; la = seg_base(ctxt, addr.seg) + addr.ea; - *linear = la; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: + *linear = la; if (is_noncanonical_address(la)) goto bad; @@ -662,6 +662,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; break; default: + *linear = la = (u32)la; usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, addr.seg); if (!usable) @@ -689,7 +690,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, if (size > *max_size) goto bad; } - la &= (u32)-1; break; } if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) -- cgit From 48fcb2d0216103d15306caa4814e2381104df6d8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 11:47:49 +0100 Subject: efi: stub: use high allocation for converted command line Before we can move the command line processing before the allocation of the kernel, which is required for detecting the 'nokaslr' option which controls that allocation, move the converted command line higher up in memory, to prevent it from interfering with the kernel itself. Since x86 needs the address to fit in 32 bits, use UINT_MAX as the upper bound there. Otherwise, use ULONG_MAX (i.e., no limit) Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/x86/include/asm/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..08b1f2f6ea50 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -25,6 +25,8 @@ #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" +#define MAX_CMDLINE_ADDRESS UINT_MAX + #ifdef CONFIG_X86_32 -- cgit From 3d44d51bd339766f0178f0cf2e8d048b4a4872aa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 24 Feb 2016 12:18:49 -0800 Subject: x86/entry/compat: Add missing CLAC to entry_INT80_32 This doesn't seem to fix a regression -- I don't think the CLAC was ever there. I double-checked in a debugger: entries through the int80 gate do not automatically clear AC. Stable maintainers: I can provide a backport to 4.3 and earlier if needed. This needs to be backported all the way to 3.10. Reported-by: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: # v3.10 and later Fixes: 63bcff2a307b ("x86, smap: Add STAC and CLAC instructions to control user space access") Link: http://lkml.kernel.org/r/b02b7e71ae54074be01fc171cbd4b72517055c0e.1456345086.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index ff1c6d61f332..3c990eeee40b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -261,6 +261,7 @@ ENTRY(entry_INT80_compat) * Interrupts are off on entry. */ PARAVIRT_ADJUST_EXCEPTION_FRAME + ASM_CLAC /* Do this early to minimize exposure */ SWAPGS /* -- cgit From 17e4bce0ae63c7e03f3c7fa8d80890e7af3d4971 Mon Sep 17 00:00:00 2001 From: Mike Krinkin Date: Wed, 24 Feb 2016 21:02:31 +0300 Subject: KVM: x86: MMU: fix ubsan index-out-of-range warning Ubsan reports the following warning due to a typo in update_accessed_dirty_bits template, the patch fixes the typo: [ 168.791851] ================================================================================ [ 168.791862] UBSAN: Undefined behaviour in arch/x86/kvm/paging_tmpl.h:252:15 [ 168.791866] index 4 is out of range for type 'u64 [4]' [ 168.791871] CPU: 0 PID: 2950 Comm: qemu-system-x86 Tainted: G O L 4.5.0-rc5-next-20160222 #7 [ 168.791873] Hardware name: LENOVO 23205NG/23205NG, BIOS G2ET95WW (2.55 ) 07/09/2013 [ 168.791876] 0000000000000000 ffff8801cfcaf208 ffffffff81c9f780 0000000041b58ab3 [ 168.791882] ffffffff82eb2cc1 ffffffff81c9f6b4 ffff8801cfcaf230 ffff8801cfcaf1e0 [ 168.791886] 0000000000000004 0000000000000001 0000000000000000 ffffffffa1981600 [ 168.791891] Call Trace: [ 168.791899] [] dump_stack+0xcc/0x12c [ 168.791904] [] ? _atomic_dec_and_lock+0xc4/0xc4 [ 168.791910] [] ubsan_epilogue+0xd/0x8a [ 168.791914] [] __ubsan_handle_out_of_bounds+0x15c/0x1a3 [ 168.791918] [] ? __ubsan_handle_shift_out_of_bounds+0x2bd/0x2bd [ 168.791922] [] ? get_user_pages_fast+0x2bf/0x360 [ 168.791954] [] ? kvm_largepages_enabled+0x30/0x30 [kvm] [ 168.791958] [] ? __get_user_pages_fast+0x360/0x360 [ 168.791987] [] paging64_walk_addr_generic+0x1b28/0x2600 [kvm] [ 168.792014] [] ? init_kvm_mmu+0x1100/0x1100 [kvm] [ 168.792019] [] ? debug_check_no_locks_freed+0x350/0x350 [ 168.792044] [] ? init_kvm_mmu+0x1100/0x1100 [kvm] [ 168.792076] [] paging64_gva_to_gpa+0x7d/0x110 [kvm] [ 168.792121] [] ? paging64_walk_addr_generic+0x2600/0x2600 [kvm] [ 168.792130] [] ? debug_lockdep_rcu_enabled+0x7b/0x90 [ 168.792178] [] emulator_read_write_onepage+0x27a/0x1150 [kvm] [ 168.792208] [] ? __kvm_read_guest_page+0x54/0x70 [kvm] [ 168.792234] [] ? kvm_task_switch+0x160/0x160 [kvm] [ 168.792238] [] ? debug_lockdep_rcu_enabled+0x7b/0x90 [ 168.792263] [] emulator_read_write+0xe7/0x6d0 [kvm] [ 168.792290] [] ? em_cr_write+0x230/0x230 [kvm] [ 168.792314] [] emulator_write_emulated+0x15/0x20 [kvm] [ 168.792340] [] segmented_write+0xf8/0x130 [kvm] [ 168.792367] [] ? em_lgdt+0x20/0x20 [kvm] [ 168.792374] [] ? vmx_read_guest_seg_ar+0x42/0x1e0 [kvm_intel] [ 168.792400] [] writeback+0x3f2/0x700 [kvm] [ 168.792424] [] ? em_sidt+0xa0/0xa0 [kvm] [ 168.792449] [] ? x86_decode_insn+0x1b3d/0x4f70 [kvm] [ 168.792474] [] x86_emulate_insn+0x572/0x3010 [kvm] [ 168.792499] [] x86_emulate_instruction+0x3bd/0x2110 [kvm] [ 168.792524] [] ? reexecute_instruction.part.110+0x2e0/0x2e0 [kvm] [ 168.792532] [] handle_ept_misconfig+0x61/0x460 [kvm_intel] [ 168.792539] [] ? 
handle_pause+0x450/0x450 [kvm_intel] [ 168.792546] [] vmx_handle_exit+0xd6a/0x1ad0 [kvm_intel] [ 168.792572] [] ? kvm_arch_vcpu_ioctl_run+0xbdc/0x6090 [kvm] [ 168.792597] [] kvm_arch_vcpu_ioctl_run+0xd3d/0x6090 [kvm] [ 168.792621] [] ? kvm_arch_vcpu_ioctl_run+0xbdc/0x6090 [kvm] [ 168.792627] [] ? __ww_mutex_lock_interruptible+0x1630/0x1630 [ 168.792651] [] ? kvm_arch_vcpu_runnable+0x4f0/0x4f0 [kvm] [ 168.792656] [] ? preempt_notifier_unregister+0x190/0x190 [ 168.792681] [] ? kvm_arch_vcpu_load+0x127/0x650 [kvm] [ 168.792704] [] kvm_vcpu_ioctl+0x553/0xda0 [kvm] [ 168.792727] [] ? vcpu_put+0x40/0x40 [kvm] [ 168.792732] [] ? debug_check_no_locks_freed+0x350/0x350 [ 168.792735] [] ? _raw_spin_unlock+0x27/0x40 [ 168.792740] [] ? handle_mm_fault+0x1673/0x2e40 [ 168.792744] [] ? trace_hardirqs_on_caller+0x478/0x6c0 [ 168.792747] [] ? trace_hardirqs_on+0xd/0x10 [ 168.792751] [] ? debug_lockdep_rcu_enabled+0x7b/0x90 [ 168.792756] [] do_vfs_ioctl+0x1b0/0x12b0 [ 168.792759] [] ? ioctl_preallocate+0x210/0x210 [ 168.792763] [] ? __fget+0x273/0x4a0 [ 168.792766] [] ? __fget+0x50/0x4a0 [ 168.792770] [] ? __fget_light+0x96/0x2b0 [ 168.792773] [] SyS_ioctl+0x79/0x90 [ 168.792777] [] entry_SYSCALL_64_fastpath+0x23/0xc1 [ 168.792780] ================================================================================ Signed-off-by: Mike Krinkin Reviewed-by: Xiao Guangrong Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6c9fed957cce..2ce4f05e81d3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -249,7 +249,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return ret; kvm_vcpu_mark_page_dirty(vcpu, table_gfn); - walker->ptes[level] = pte; + walker->ptes[level - 1] = pte; } return 0; } -- cgit From d89abe2a1f0c92499eedc815e4a9b2881f4959a5 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 9 Feb 2016 02:43:02 +0100 Subject: arch/x86/irq: Purge useless handler declarations from hw_irq.h arch/x86/include/asm/hw_irq.h contains declarations for the C-level handlers called into directly from the IDT-referenced assembly stubs. These declarations are never used as they are referenced from assembly only. Furthermore, these declarations got their attributes wrong: there is no '__irqentry' (parameter passing via stack) attached to them. Also, the list of declarations isn't complete: none of the tracing-capable variants is declared, for example. Purge the handler declarations. 
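As a hedged illustration of why no C-level prototype is needed (the handler name and body below are hypothetical, not code from this patch): the IDT stubs are generated in assembly and call the C handler by symbol name, which the linker resolves without any header declaration, e.g.

	/* Hypothetical handler -- only ever invoked from an assembly IDT stub,
	 * so no C translation unit needs a prototype for it.  __visible keeps
	 * the symbol alive even though it has no C callers.
	 */
	__visible void smp_example_vector_interrupt(struct pt_regs *regs)
	{
		entering_irq();
		/* ... service the vector ... */
		exiting_irq();
	}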
Signed-off-by: Nicolai Stange Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1815b736269d..f564d2b2275f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -168,20 +168,6 @@ extern atomic_t irq_mis_count; extern void elcr_set_level_irq(unsigned int irq); -/* SMP */ -extern __visible void smp_apic_timer_interrupt(struct pt_regs *); -extern __visible void smp_spurious_interrupt(struct pt_regs *); -extern __visible void smp_x86_platform_ipi(struct pt_regs *); -extern __visible void smp_error_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_IO_APIC -extern asmlinkage void smp_irq_move_cleanup_interrupt(void); -#endif -#ifdef CONFIG_SMP -extern __visible void smp_reschedule_interrupt(struct pt_regs *); -extern __visible void smp_call_function_interrupt(struct pt_regs *); -extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -#endif - extern char irq_entries_start[]; #ifdef CONFIG_TRACING #define trace_irq_entries_start irq_entries_start -- cgit From 405e1133d00e0271cedef75c17ecb773ff3e2732 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 10 Feb 2016 02:03:00 -0700 Subject: x86/mm: Avoid premature success when changing page attributes set_memory_nx() (and set_memory_x()) currently differ in behavior from all other set_memory_*() functions when encountering a virtual address space hole within the kernel address range: They stop processing at the hole, but nevertheless report success (making the caller believe the operation was carried out on the entire range). While observed to be a problem - triggering the CONFIG_DEBUG_WX warning - only with out of tree code, I suspect (but didn't check) that on x86-64 the CONFIG_DEBUG_PAGEALLOC logic in free_init_pages() would, when called from free_initmem(), have the same effect on the set_memory_nx() called from mark_rodata_ro(). This unexpected behavior is a result of change_page_attr_set_clr() special casing changes to only the NX bit, in that it passes "false" as the "checkalias" argument to __change_page_attr_set_clr(). Since this flag becomes the "primary" argument of both __change_page_attr() and __cpa_process_fault(), the latter would so far return success without adjusting cpa->numpages. Success to the higher level callers, however, means that whatever cpa->numpages currently holds is the count of successfully processed pages. The cases when __change_page_attr() calls __cpa_process_fault(), otoh, don't generally mean the entire range got processed (as can be seen from one of the two success return paths in __cpa_process_fault() already adjusting ->numpages). Signed-off-by: Jan Beulich Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/56BB0AD402000078000D05BF@prv-mh.provo.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814b0069..3dd6afd2c0e5 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1122,8 +1122,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, /* * Ignore all non primary paths. 
*/ - if (!primary) + if (!primary) { + cpa->numpages = 1; return 0; + } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected -- cgit From 8577370fb0cbe88266b7583d8d3b9f43ced077a0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 19 Feb 2016 09:46:39 +0100 Subject: KVM: Use simple waitqueue for vcpu->wq The problem: On -rt, an emulated LAPIC timer instances has the following path: 1) hard interrupt 2) ksoftirqd is scheduled 3) ksoftirqd wakes up vcpu thread 4) vcpu thread is scheduled This extra context switch introduces unnecessary latency in the LAPIC path for a KVM guest. The solution: Allow waking up vcpu thread from hardirq context, thus avoiding the need for ksoftirqd to be scheduled. Normal waitqueues make use of spinlocks, which on -RT are sleepable locks. Therefore, waking up a waitqueue waiter involves locking a sleeping lock, which is not allowed from hard interrupt context. cyclictest command line: This patch reduces the average latency in my tests from 14us to 11us. Daniel writes: Paolo asked for numbers from kvm-unit-tests/tscdeadline_latency benchmark on mainline. The test was run 1000 times on tip/sched/core 4.4.0-rc8-01134-g0905f04: ./x86-run x86/tscdeadline_latency.flat -cpu host with idle=poll. The test seems not to deliver really stable numbers though most of them are smaller. Paolo write: "Anything above ~10000 cycles means that the host went to C1 or lower---the number means more or less nothing in that case. The mean shows an improvement indeed." Before: min max mean std count 1000.000000 1000.000000 1000.000000 1000.000000 mean 5162.596000 2019270.084000 5824.491541 20681.645558 std 75.431231 622607.723969 89.575700 6492.272062 min 4466.000000 23928.000000 5537.926500 585.864966 25% 5163.000000 1613252.750000 5790.132275 16683.745433 50% 5175.000000 2281919.000000 5834.654000 23151.990026 75% 5190.000000 2382865.750000 5861.412950 24148.206168 max 5228.000000 4175158.000000 6254.827300 46481.048691 After min max mean std count 1000.000000 1000.00000 1000.000000 1000.000000 mean 5143.511000 2076886.10300 5813.312474 21207.357565 std 77.668322 610413.09583 86.541500 6331.915127 min 4427.000000 25103.00000 5529.756600 559.187707 25% 5148.000000 1691272.75000 5784.889825 17473.518244 50% 5160.000000 2308328.50000 5832.025000 23464.837068 75% 5172.000000 2393037.75000 5853.177675 24223.969976 max 5222.000000 3922458.00000 6186.720500 42520.379830 [Patch was originaly based on the swait implementation found in the -rt tree. Daniel ported it to mainline's version and gathered the benchmark numbers for tscdeadline_latency test.] Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. 
McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-4-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner --- arch/x86/kvm/lapic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36591faed13b..3a045f39ed81 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - wait_queue_head_t *q = &vcpu->wq; + struct swait_queue_head *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); - if (waitqueue_active(q)) - wake_up_interruptible(q); + if (swait_active(q)) + swake_up(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; -- cgit From bf70e5513dfea29c3682e7eb3dbb45f0723bac09 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 25 Feb 2016 01:58:12 -0800 Subject: x86/mm: Fix slow_virt_to_phys() for X86_PAE again "d1cd12108346: x86, pageattr: Prevent overflow in slow_virt_to_phys() for X86_PAE" was unintentionally removed by the recent "34437e67a672: x86/mm: Fix slow_virt_to_phys() to handle large PAT bit". And, the variable 'phys_addr' was defined as "unsigned long" by mistake -- it should be "phys_addr_t". As a result, Hyper-V network driver in 32-PAE Linux guest can't work again. Fixes: commit 34437e67a672: "x86/mm: Fix slow_virt_to_phys() to handle large PAT bit" Signed-off-by: Dexuan Cui Reviewed-by: Toshi Kani Cc: olaf@aepfle.de Cc: gregkh@linuxfoundation.org Cc: jasowang@redhat.com Cc: driverdev-devel@linuxdriverproject.org Cc: linux-mm@kvack.org Cc: apw@canonical.com Cc: Andrew Morton Cc: K. Y. Srinivasan Cc: Haiyang Zhang Link: http://lkml.kernel.org/r/1456394292-9030-1-git-send-email-decui@microsoft.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814b0069..9cf96d82147a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -419,24 +419,30 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - unsigned long phys_addr, offset; + phys_addr_t phys_addr; + unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); + /* + * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t + * before being left-shifted PAGE_SHIFT bits -- this trick is to + * make 32-PAE kernel work correctly. 
+ */ switch (level) { case PG_LEVEL_1G: - phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_PAGE_MASK; break; case PG_LEVEL_2M: - phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_PAGE_MASK; break; default: - phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } -- cgit From 70e4da7a8ff62f2775337b705f45c804bb450454 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 26 Feb 2016 12:28:40 +0100 Subject: KVM: x86: fix root cause for missed hardware breakpoints Commit 172b2386ed16 ("KVM: x86: fix missed hardware breakpoints", 2016-02-10) worked around a case where the debug registers are not loaded correctly on preemption and on the first entry to KVM_RUN. However, Xiao Guangrong pointed out that the root cause must be that KVM_DEBUGREG_BP_ENABLED is not being set correctly. This can indeed happen due to the lazy debug exit mechanism, which does not call kvm_update_dr7. Fix it by replacing the existing loop (more or less equivalent to kvm_update_dr0123) with calls to all the kvm_update_dr* functions. Cc: stable@vger.kernel.org # 4.1+ Fixes: 172b2386ed16a9143d9a456aae5ec87275c61489 Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4891f2ece23..eaf6ee8c28b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2752,7 +2752,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -6619,12 +6618,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * KVM_DEBUGREG_WONT_EXIT again. */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - int i; - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + kvm_update_dr0123(vcpu); + kvm_update_dr6(vcpu); + kvm_update_dr7(vcpu); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* -- cgit From 9bf148cb0812595bfdf5100bd2c07e9bec9c6ef5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Feb 2016 18:55:31 +0000 Subject: x86/mpx: Fix off-by-one comparison with nr_registers In the unlikely event that regno == nr_registers then we get an array overrun on regoff because the invalid register check is currently off-by-one. Fix this with a check that regno is >= nr_registers instead. Detected with static analysis using CoverityScan. Fixes: fcc7ffd67991 "x86, mpx: Decode MPX instruction to get bound violation information" Signed-off-by: Colin Ian King Acked-by: Dave Hansen Cc: Borislav Petkov Cc: "Kirill A . 
Shutemov" Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1456512931-3388-1-git-send-email-colin.king@canonical.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/mpx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b2fd67da1701..ef05755a1900 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -123,7 +123,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, break; } - if (regno > nr_registers) { + if (regno >= nr_registers) { WARN_ONCE(1, "decoded an instruction with an invalid register"); return -EINVAL; } -- cgit From 6c777e8799a93e3bdb67bec622429e1b48dc90fb Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 17 Feb 2016 12:26:42 -0600 Subject: Revert "PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 991de2e59090 ("PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()") appeared in v4.3 and helps support IOAPIC hotplug. Олег reported that the Elcus-1553 TA1-PCI driver worked in v4.2 but not v4.3 and bisected it to 991de2e59090. Sunjin reported that the RocketRAID 272x driver worked in v4.2 but not v4.3. In both cases booting with "pci=routirq" is a workaround. I think the problem is that after 991de2e59090, we no longer call pcibios_enable_irq() for upstream bridges. Prior to 991de2e59090, when a driver called pci_enable_device(), we recursively called pcibios_enable_irq() for upstream bridges via pci_enable_bridge(). After 991de2e59090, we call pcibios_enable_irq() from pci_device_probe() instead of the pci_enable_device() path, which does *not* call pcibios_enable_irq() for upstream bridges. Revert 991de2e59090 to fix these driver regressions. Link: https://bugzilla.kernel.org/show_bug.cgi?id=111211 Fixes: 991de2e59090 ("PCI, x86: Implement pcibios_alloc_irq() and pcibios_free_irq()") Reported-and-tested-by: Олег Мороз Reported-by: Sunjin Yang Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. 
Wysocki CC: Jiang Liu --- arch/x86/include/asm/pci_x86.h | 2 ++ arch/x86/pci/common.c | 20 +++++++++++--------- arch/x86/pci/intel_mid_pci.c | 7 ++----- arch/x86/pci/irq.c | 15 ++++++++++++++- 4 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 46873fbd44e1..d08eacd298c2 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index cb499c5b6ae5..d34b5118b4e8 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -711,20 +711,22 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -int pcibios_alloc_irq(struct pci_dev *dev) +int pcibios_enable_device(struct pci_dev *dev, int mask) { - return pcibios_enable_irq(dev); -} + int err; -void pcibios_free_irq(struct pci_dev *dev) -{ - if (pcibios_disable_irq) - pcibios_disable_irq(dev); + if ((err = pci_enable_resources(dev, mask)) < 0) + return err; + + if (!pci_dev_msi_enabled(dev)) + return pcibios_enable_irq(dev); + return 0; } -int pcibios_enable_device(struct pci_dev *dev, int mask) +void pcibios_disable_device (struct pci_dev *dev) { - return pci_enable_resources(dev, mask); + if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + pcibios_disable_irq(dev); } int pci_ext_cfg_avail(void) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 8826ff593ebc..8b93e634af84 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -256,13 +256,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (dev->irq_managed && dev->irq > 0) { + if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && + dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; - /* - * Don't reset dev->irq here, otherwise - * intel_mid_pci_irq_enable() will fail on next call. - */ } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 72108f0b66b1..9bd115484745 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1257,9 +1257,22 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } +bool mp_should_keep_irq(struct device *dev) +{ + if (dev->power.is_prepared) + return true; +#ifdef CONFIG_PM + if (dev->power.runtime_status == RPM_SUSPENDING) + return true; +#endif + + return false; +} + static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && + dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); dev->irq = 0; dev->irq_managed = 0; -- cgit From 5ef11c35ce86b94bfb878b684de4cdaf96f54b2f Mon Sep 17 00:00:00 2001 From: Daniel Cashman Date: Fri, 26 Feb 2016 15:19:37 -0800 Subject: mm: ASLR: use get_random_long() Replace calls to get_random_int() followed by a cast to (unsigned long) with calls to get_random_long(). Also address shifting bug which, in case of x86 removed entropy mask for mmap_rnd_bits values > 31 bits. 
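The shifting bug is easiest to see in isolation. A minimal user-space sketch (not kernel code; the value 33 stands in for a hypothetical mmap_rnd_bits greater than 31) of why the literal '1' must be promoted to unsigned long before the shift:

	#include <stdio.h>

	int main(void)
	{
		unsigned int bits = 33;		/* hypothetical mmap_rnd_bits > 31 */

		/* Broken: '1' is a 32-bit int, so shifting it by 32 or more is
		 * undefined and the mask loses every bit above bit 31:
		 *   rnd = get_random_int() & ((1 << bits) - 1);
		 */

		/* Fixed: the shift is performed in unsigned long. */
		unsigned long mask = (1UL << bits) - 1;

		printf("mask = %#lx\n", mask);	/* 0x1ffffffff on LP64 */
		return 0;
	}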
Signed-off-by: Daniel Cashman Acked-by: Kees Cook Cc: "Theodore Ts'o" Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Nick Kralevich Cc: Jeff Vander Stoep Cc: Mark Salyzyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2bffaf..72bb52f93c3d 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void) if (mmap_is_ia32()) #ifdef CONFIG_COMPAT - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); #else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); #endif else - rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } -- cgit From c0dd671686b2229e888ede77682ab0633b2a0dd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:34 -0600 Subject: objtool: Mark non-standard object files and directories Code which runs outside the kernel's normal mode of operation often does unusual things which can cause a static analysis tool like objtool to emit false positive warnings: - boot image - vdso image - relocation - realmode - efi - head - purgatory - modpost Set OBJECT_FILES_NON_STANDARD for their related files and directories, which will tell objtool to skip checking them. It's ok to skip them because they don't affect runtime stack traces. Also skip the following code which does the right thing with respect to frame pointers, but is too "special" to be validated by a tool: - entry - mcount Also skip the test_nx module because it modifies its exception handling table at runtime, which objtool can't understand. Fortunately it's just a test module so it doesn't matter much. Currently objtool is the only user of OBJECT_FILES_NON_STANDARD, but it might eventually be useful for other tools. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/366c080e3844e8a5b6a0327dc7e8c2b90ca3baeb.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/Makefile | 3 ++- arch/x86/boot/compressed/Makefile | 3 ++- arch/x86/entry/Makefile | 4 ++++ arch/x86/entry/vdso/Makefile | 6 ++++-- arch/x86/kernel/Makefile | 11 ++++++++--- arch/x86/platform/efi/Makefile | 2 ++ arch/x86/purgatory/Makefile | 2 ++ arch/x86/realmode/Makefile | 4 +++- arch/x86/realmode/rm/Makefile | 3 ++- 9 files changed, 29 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index bbe1a62efc02..0bf6749522d9 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,7 +9,8 @@ # Changed by many, many contributors over the years. 
# -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f9ce75d80101..5e1d26e09407 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -16,7 +16,8 @@ # (see scripts/Makefile.lib size_append) # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index bd55dedd7614..fe91c25092da 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,6 +1,10 @@ # # Makefile for the x86 low level entry code # + +OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y + obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index c854541d93ff..f9fb859c98b9 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,8 +3,9 @@ # KBUILD_CFLAGS += $(DISABLE_LTO) -KASAN_SANITIZE := n -UBSAN_SANITIZE := n +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y @@ -16,6 +17,7 @@ vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o # files to link into kernel obj-y += vma.o +OBJECT_FILES_NON_STANDARD_vma.o := n # vDSO images to build vdso_img-$(VDSO64-y) += 64 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b1b78ffe01d0..d5fb0871aba3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,9 +16,14 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -KASAN_SANITIZE_head$(BITS).o := n -KASAN_SANITIZE_dumpstack.o := n -KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n + +OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_test_nx.o := y CFLAGS_irq.o := -I$(src)/../include/asm/trace diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 2846aaab5103..066619b0700c 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y + obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 2c835e356349..92e3e1d84c1d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD := y + purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o targets += $(purgatory-y) diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index e02c2c6c56a5..682c895753d9 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -6,7 +6,9 @@ # for more details. 
# # -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y + subdir- := rm obj-y += init.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3e75fcf6b836..053abe7b0ef7 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -6,7 +6,8 @@ # for more details. # # -KASAN_SANITIZE := n +KASAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y always := realmode.bin realmode.relocs -- cgit From 9a99417acbad99ba6c6a9389b45a53a4d002bb7e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:35 -0600 Subject: objtool: Add STACK_FRAME_NON_STANDARD() macro Add a new macro, STACK_FRAME_NON_STANDARD(), which is used to denote a function which does something unusual related to its stack frame. Use of the macro prevents objtool from emitting a false positive warning. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/34487a17b23dba43c50941599d47054a9584b219.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92dc211c11db..13fa0ad62df8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -343,7 +343,10 @@ SECTIONS /* Sections to be discarded */ DISCARDS - /DISCARD/ : { *(.eh_frame) } + /DISCARD/ : { + *(.eh_frame) + *(__func_stack_frame_non_standard) + } } -- cgit From 983bb6d254c77aaec581473e11c2eb859294f1f2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:36 -0600 Subject: x86/xen: Mark xen_cpuid() stack frame as non-standard objtool reports the following false positive warning: arch/x86/xen/enlighten.o: warning: objtool: xen_cpuid()+0x41: can't find jump dest instruction at .text+0x108 The warning is due to xen_cpuid()'s use of XEN_EMULATE_PREFIX to insert some fake instructions which objtool doesn't know how to decode. 
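Marking the function with STACK_FRAME_NON_STANDARD() is what makes objtool skip it. The macro itself is defined outside arch/x86 and does not appear in this excerpt; a rough sketch of the pattern it relies on, assuming it simply records a function pointer in the __func_stack_frame_non_standard section that objtool reads at build time and that the linker script change above discards from the final image:

	/* Illustrative sketch only, not the actual include/linux/frame.h definition. */
	#define STACK_FRAME_NON_STANDARD(func)					\
		static void __used __section(__func_stack_frame_non_standard)	\
			*__func_stack_frame_non_standard_##func = func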
Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Chris J Arges Cc: David Vrabel Cc: Jiri Slaby Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/bb88399840406629e3417831dc371ecd2842e2a6.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d09e4c9d7cc5..5c45a69ffe84 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef CONFIG_KEXEC_CORE #include @@ -351,8 +352,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *cx &= maskecx; *cx |= setecx; *dx &= maskedx; - } +STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ static bool __init xen_check_mwait(void) { -- cgit From 87aaff2ae09036cf699fde20dfd52ce7d3c8eabe Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:40 -0600 Subject: x86/kprobes: Mark kretprobe_trampoline() stack frame as non-standard objtool reports the following warning for kretprobe_trampoline(): arch/x86/kernel/kprobes/core.o: warning: objtool: kretprobe_trampoline()+0x20: call without frame pointer save/setup kretprobes are a special case where the stack is intentionally wrong. The return address isn't known at the beginning of the trampoline, so the stack frame can't be set up properly before it calls trampoline_handler(). Because kretprobe handlers don't sleep, the frame pointer doesn't *have* to be accurate in the trampoline. So it's ok to tell objtool to ignore it. This results in no actual changes to the generated code. Signed-off-by: Josh Poimboeuf Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Anil S Keshavamurthy Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: David S. Miller Cc: Jiri Slaby Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/7eaf37de52456ff822ffc86b928edb5d48a40ef1.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 48acaac35bd2..ae703acb85c1 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -703,6 +704,7 @@ asm( ".size kretprobe_trampoline, .-kretprobe_trampoline\n" ); NOKPROBE_SYMBOL(kretprobe_trampoline); +STACK_FRAME_NON_STANDARD(kretprobe_trampoline); /* * Called from kretprobe_trampoline -- cgit From d4883d5d6b146fd65f762c462b2c6d4a327c7d50 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:43 -0600 Subject: objtool: Enable stack metadata validation on 64-bit x86 Set HAVE_STACK_VALIDATION to enable stack metadata validation for x86_64. 
Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/cdaeb6914d00a070c0f455cd06989bf3f787a2f6.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c46662f64c39..adc5a6d28735 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,6 +155,7 @@ config X86 select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS + select HAVE_STACK_VALIDATION if X86_64 config INSTRUCTION_DECODER def_bool y -- cgit From 6cb2f1d9af5b0f0afdd4e689d969df4b5c76a4c2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 24 Feb 2016 05:07:43 -0500 Subject: perf/x86/intel/uncore: Remove SBOX support for BDX-DE BDX-DE and BDX-EP share the same uncore code path. But there is no sbox in BDX-DE. This patch remove SBOX support for BDX-DE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Battersby Cc: Vince Weaver Link: http://lkml.kernel.org/r/37D7C6CF3E00A74B8858931C1DB2F0770589D336@SHSMSX103.ccr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 0c801f77c03d..d967fcc9d5fb 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2874,11 +2874,13 @@ static struct intel_uncore_type bdx_uncore_sbox = { .format_group = &hswep_uncore_sbox_format_group, }; +#define BDX_MSR_UNCORE_SBOX 3 + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, - &bdx_uncore_sbox, &hswep_uncore_pcu, + &bdx_uncore_sbox, NULL, }; @@ -2887,6 +2889,10 @@ void bdx_uncore_cpu_init(void) if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + + /* BDX-DE doesn't have SBOX */ + if (boot_cpu_data.x86_model == 86) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; } static struct intel_uncore_type bdx_uncore_ha = { -- cgit From 3ccca9eca67ec68461a5266832d02aee538599b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:08 +0000 Subject: perf/x86/intel/uncore: Remove pointless mask check uncore_cpumask_init() is only ever called from intel_uncore_init() where the mask is guaranteed to be empty. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.657326866@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 91a18d6c4405..c422d52ecefa 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1342,12 +1342,6 @@ static void __init uncore_cpumask_init(void) { int cpu; - /* - * ony invoke once from msr or pci init code - */ - if (!cpumask_empty(&uncore_cpu_mask)) - return; - cpu_notifier_register_begin(); for_each_online_cpu(cpu) { -- cgit From ffeda003803213a8d0babefdd6a95fe424884c14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:09 +0000 Subject: perf/x86/intel/uncore: Simplify error rollback No point in doing partial rollbacks. Robustify uncore_exit_type() so it does not dereference type->pmus unconditionally and remove all the partial rollback hackery. Preparatory patch for proper error handling. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.751077467@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 45 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c422d52ecefa..91facdc22d18 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -767,10 +767,12 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) { int i; - for (i = 0; i < type->num_boxes; i++) - free_percpu(type->pmus[i].box); - kfree(type->pmus); - type->pmus = NULL; + if (type->pmus) { + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -778,6 +780,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) static void __init uncore_types_exit(struct intel_uncore_type **types) { int i; + for (i = 0; types[i]; i++) uncore_type_exit(types[i]); } @@ -806,7 +809,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) INIT_LIST_HEAD(&pmus[i].box_list); pmus[i].box = alloc_percpu(struct intel_uncore_box *); if (!pmus[i].box) - goto fail; + return -ENOMEM; } if (type->event_descs) { @@ -817,7 +820,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); if (!attr_group) - goto fail; + return -ENOMEM; attrs = (struct attribute **)(attr_group + 1); attr_group->name = "events"; @@ -831,9 +834,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type) type->pmu_group = &uncore_pmu_attr_group; return 0; -fail: - uncore_type_exit(type); - return -ENOMEM; } static int __init uncore_types_init(struct intel_uncore_type **types) @@ 
-843,13 +843,9 @@ static int __init uncore_types_init(struct intel_uncore_type **types) for (i = 0; types[i]; i++) { ret = uncore_type_init(types[i]); if (ret) - goto fail; + return ret; } return 0; -fail: - while (--i >= 0) - uncore_type_exit(types[i]); - return ret; } /* @@ -1007,17 +1003,21 @@ static int __init uncore_pci_init(void) ret = uncore_types_init(uncore_pci_uncores); if (ret) - return ret; + goto err; uncore_pci_driver->probe = uncore_pci_probe; uncore_pci_driver->remove = uncore_pci_remove; ret = pci_register_driver(uncore_pci_driver); - if (ret == 0) - pcidrv_registered = true; - else - uncore_types_exit(uncore_pci_uncores); + if (ret) + goto err; + + pcidrv_registered = true; + return 0; +err: + uncore_types_exit(uncore_pci_uncores); + uncore_pci_uncores = empty_uncore; return ret; } @@ -1316,9 +1316,12 @@ static int __init uncore_cpu_init(void) ret = uncore_types_init(uncore_msr_uncores); if (ret) - return ret; - + goto err; return 0; +err: + uncore_types_exit(uncore_msr_uncores); + uncore_msr_uncores = empty_uncore; + return ret; } static int __init uncore_pmus_register(void) -- cgit From 4f089678d071781851c3b73c41e55a3765b6a5ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:09 +0000 Subject: perf/x86/intel/uncore: Fix error handling This driver lacks any form of proper error handling. If initialization fails or hotplug prepare fails, it lets the facility with half initialized stuff around. Fix the state and memory leaks in a first step. As a second step we need to undo the hardware state which is set via uncore_box_init() on some of the uncore implementations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.848880559@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 122 +++++++++++++++++++++++++++++------------ arch/x86/events/intel/uncore.h | 15 ++--- 2 files changed, 94 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 91facdc22d18..25e62037563e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -38,6 +38,16 @@ int uncore_pcibus_to_physid(struct pci_bus *bus) return phys_id; } +static void uncore_free_pcibus_map(void) +{ + struct pci2phy_map *map, *tmp; + + list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) { + list_del(&map->list); + kfree(map); + } +} + struct pci2phy_map *__find_pci2phy_map(int segment) { struct pci2phy_map *map, *alloc = NULL; @@ -760,16 +770,28 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) } ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + if (!ret) + pmu->registered = true; return ret; } +static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) +{ + if (!pmu->registered) + return; + perf_pmu_unregister(&pmu->pmu); + pmu->registered = false; +} + static void __init uncore_type_exit(struct intel_uncore_type *type) { int i; if (type->pmus) { - for (i = 0; i < type->num_boxes; i++) + for (i = 0; i < type->num_boxes; i++) { + uncore_pmu_unregister(&type->pmus[i]); free_percpu(type->pmus[i].box); + } kfree(type->pmus); type->pmus = NULL; } @@ -856,8 +878,8 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id 
struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; struct intel_uncore_type *type; - int phys_id; bool first_box = false; + int phys_id, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) @@ -906,9 +928,18 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id list_add_tail(&box->list, &pmu->box_list); raw_spin_unlock(&uncore_box_lock); - if (first_box) - uncore_pmu_register(pmu); - return 0; + if (!first_box) + return 0; + + ret = uncore_pmu_register(pmu); + if (ret) { + pci_set_drvdata(pdev, NULL); + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + kfree(box); + } + return ret; } static void uncore_pci_remove(struct pci_dev *pdev) @@ -954,7 +985,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); if (last_box) - perf_pmu_unregister(&pmu->pmu); + uncore_pmu_unregister(pmu); } static int __init uncore_pci_init(void) @@ -1018,6 +1049,7 @@ static int __init uncore_pci_init(void) err: uncore_types_exit(uncore_pci_uncores); uncore_pci_uncores = empty_uncore; + uncore_free_pcibus_map(); return ret; } @@ -1027,6 +1059,7 @@ static void __init uncore_pci_exit(void) pcidrv_registered = false; pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); + uncore_free_pcibus_map(); } } @@ -1223,8 +1256,7 @@ static int uncore_cpu_notifier(struct notifier_block *self, /* allocate/free data structure for uncore box */ switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - uncore_cpu_prepare(cpu, -1); - break; + return notifier_from_errno(uncore_cpu_prepare(cpu, -1)); case CPU_STARTING: uncore_cpu_starting(cpu); break; @@ -1265,9 +1297,29 @@ static struct notifier_block uncore_cpu_nb = { .priority = CPU_PRI_PERF + 1, }; -static void __init uncore_cpu_setup(void *dummy) +static int __init type_pmu_register(struct intel_uncore_type *type) { - uncore_cpu_starting(smp_processor_id()); + int i, ret; + + for (i = 0; i < type->num_boxes; i++) { + ret = uncore_pmu_register(&type->pmus[i]); + if (ret) + return ret; + } + return 0; +} + +static int __init uncore_msr_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_msr_uncores; + int ret; + + while (*types) { + ret = type_pmu_register(*types++); + if (ret) + return ret; + } + return 0; } static int __init uncore_cpu_init(void) @@ -1315,6 +1367,10 @@ static int __init uncore_cpu_init(void) } ret = uncore_types_init(uncore_msr_uncores); + if (ret) + goto err; + + ret = uncore_msr_pmus_register(); if (ret) goto err; return 0; @@ -1324,26 +1380,14 @@ err: return ret; } -static int __init uncore_pmus_register(void) +static void __init uncore_cpu_setup(void *dummy) { - struct intel_uncore_pmu *pmu; - struct intel_uncore_type *type; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - return 0; + uncore_cpu_starting(smp_processor_id()); } -static void __init uncore_cpumask_init(void) +static int __init uncore_cpumask_init(void) { - int cpu; + int cpu, ret = 0; cpu_notifier_register_begin(); @@ -1359,17 +1403,20 @@ static void __init uncore_cpumask_init(void) if (phys_id < 0) continue; - uncore_cpu_prepare(cpu, phys_id); + ret = uncore_cpu_prepare(cpu, phys_id); + if (ret) + goto out; uncore_event_init_cpu(cpu); } on_each_cpu(uncore_cpu_setup, NULL, 1); __register_cpu_notifier(&uncore_cpu_nb); +out: cpu_notifier_register_done(); + return ret; } - static int __init 
intel_uncore_init(void) { int ret; @@ -1382,17 +1429,20 @@ static int __init intel_uncore_init(void) ret = uncore_pci_init(); if (ret) - goto fail; + return ret; ret = uncore_cpu_init(); - if (ret) { - uncore_pci_exit(); - goto fail; - } - uncore_cpumask_init(); + if (ret) + goto errpci; + ret = uncore_cpumask_init(); + if (ret) + goto errcpu; - uncore_pmus_register(); return 0; -fail: + +errcpu: + uncore_types_exit(uncore_msr_uncores); +errpci: + uncore_pci_exit(); return ret; } device_initcall(intel_uncore_init); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 6a1340c7f3cc..a18cc7f5e5a2 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -73,13 +73,14 @@ struct intel_uncore_ops { }; struct intel_uncore_pmu { - struct pmu pmu; - char name[UNCORE_PMU_NAME_LEN]; - int pmu_idx; - int func_id; - struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + bool registered; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; }; struct intel_uncore_extra_reg { -- cgit From 83f8ebd2eb459b21e8e74d43c75e1be666ec8b97 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:10 +0000 Subject: perf/x86/intel/uncore: Add sanity checks for PCI dev package id The storage array is size limited, but misses a sanity check Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.929967806@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 25e62037563e..42ea435e5ead 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -882,7 +882,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id int phys_id, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) + if (phys_id < 0 || phys_id >= UNCORE_SOCKET_MAX) return -ENODEV; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { -- cgit From a46195f1782e94ad07e7485fb15b80db7e60e7aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:11 +0000 Subject: perf/x86/intel/uncore: Clean up hardware on exit When tearing down the boxes nothing undoes the hardware state which was setup by box->init_box(). Add a box->exit_box() callback and implement it for the uncores which have an init_box() callback. This misses the cleanup in the error exit pathes, but I cannot be bothered to implement it before cleaning up the rest of the driver, which makes that task way simpler. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.023930023@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 6 +++++- arch/x86/events/intel/uncore.h | 9 +++++++++ arch/x86/events/intel/uncore_nhmex.c | 6 ++++++ arch/x86/events/intel/uncore_snb.c | 13 +++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 42ea435e5ead..c3d1cbbaa917 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -937,6 +937,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id raw_spin_lock(&uncore_box_lock); list_del(&box->list); raw_spin_unlock(&uncore_box_lock); + uncore_box_exit(box); kfree(box); } return ret; @@ -982,6 +983,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) } WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + uncore_box_exit(box); kfree(box); if (last_box) @@ -1091,8 +1093,10 @@ static void uncore_cpu_dying(int cpu) pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) + if (box && atomic_dec_and_test(&box->refcnt)) { list_add(&box->list, &boxes_to_free); + uncore_box_exit(box); + } } } } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index a18cc7f5e5a2..cf2cb1fd85e9 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -61,6 +61,7 @@ struct intel_uncore_type { struct intel_uncore_ops { void (*init_box)(struct intel_uncore_box *); + void (*exit_box)(struct intel_uncore_box *); void (*disable_box)(struct intel_uncore_box *); void (*enable_box)(struct intel_uncore_box *); void (*disable_event)(struct intel_uncore_box *, struct perf_event *); @@ -306,6 +307,14 @@ static inline void uncore_box_init(struct intel_uncore_box *box) } } +static inline void uncore_box_exit(struct intel_uncore_box *box) +{ + if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->exit_box) + box->pmu->type->ops->exit_box(box); + } +} + static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { return (box->phys_id < 0); diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index e89bf5c536e0..cda569332005 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -201,6 +201,11 @@ static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); } +static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0); +} + static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) { unsigned msr = uncore_msr_box_ctl(box); @@ -250,6 +255,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p #define NHMEX_UNCORE_OPS_COMMON_INIT() \ .init_box = nhmex_uncore_msr_init_box, \ + .exit_box = nhmex_uncore_msr_exit_box, \ .disable_box = nhmex_uncore_msr_disable_box, \ .enable_box = nhmex_uncore_msr_enable_box, \ .disable_event = nhmex_uncore_msr_disable_event, \ diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c index 2049d26a7ae6..120e106389f6 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -95,6 +95,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0); +} + static struct uncore_event_desc snb_uncore_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), { /* end: all zeroes */ }, @@ -116,6 +122,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, .read_counter = uncore_msr_read_counter, @@ -231,6 +238,11 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } +static void snb_uncore_imc_exit_box(struct intel_uncore_box *box) +{ + iounmap(box->io_addr); +} + static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) {} @@ -458,6 +470,7 @@ static struct pmu snb_uncore_imc_pmu = { static struct intel_uncore_ops snb_uncore_imc_ops = { .init_box = snb_uncore_imc_init_box, + .exit_box = snb_uncore_imc_exit_box, .enable_box = snb_uncore_imc_enable_box, .disable_box = snb_uncore_imc_disable_box, .disable_event = snb_uncore_imc_disable_event, -- cgit From 1229735b290d12003a1362ccfc442c690c93bc3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:12 +0000 Subject: perf/x86/intel/uncore: Make code more readable Clean up the code a bit before reworking it completely. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.204771538@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 95 ++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c3d1cbbaa917..84055f2c36f2 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -217,7 +217,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) return config; } -static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +static void uncore_assign_hw_event(struct intel_uncore_box *box, + struct perf_event *event, int idx) { struct hw_perf_event *hwc = &event->hw; @@ -312,18 +313,19 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int node) { + int i, size, numshared = type->num_shared_regs ; struct intel_uncore_box *box; - int i, size; - size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg); box = kzalloc_node(size, GFP_KERNEL, node); if (!box) return NULL; - for (i = 0; i < type->num_shared_regs; 
i++) + for (i = 0; i < numshared; i++) raw_spin_lock_init(&box->shared_regs[i].lock); uncore_pmu_init_hrtimer(box); @@ -351,7 +353,8 @@ static bool is_uncore_event(struct perf_event *event) } static int -uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, + bool dogrp) { struct perf_event *event; int n, max_count; @@ -412,7 +415,8 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve return &type->unconstrainted; } -static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) { if (box->pmu->type->ops->put_constraint) box->pmu->type->ops->put_constraint(box, event); @@ -592,7 +596,7 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags) if (event == box->event_list[i]) { uncore_put_event_constraint(box, event); - while (++i < box->n_events) + for (++i; i < box->n_events; i++) box->event_list[i - 1] = box->event_list[i]; --box->n_events; @@ -801,10 +805,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) static void __init uncore_types_exit(struct intel_uncore_type **types) { - int i; - - for (i = 0; types[i]; i++) - uncore_type_exit(types[i]); + for (; *types; types++) + uncore_type_exit(*types); } static int __init uncore_type_init(struct intel_uncore_type *type) @@ -908,9 +910,11 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * some device types. Hence PCI device idx would be 0 for all devices. * So increment pmu pointer to point to an unused array element. */ - if (boot_cpu_data.x86_model == 87) + if (boot_cpu_data.x86_model == 87) { while (pmu->func_id >= 0) pmu++; + } + if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else @@ -1170,44 +1174,45 @@ static int uncore_cpu_prepare(int cpu, int phys_id) return 0; } -static void -uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, + int new_cpu) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; + struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncores[i]; i++) { - type = uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); - if (!box) - continue; + int i; - if (old_cpu < 0) { - WARN_ON_ONCE(box->cpu != -1); - box->cpu = new_cpu; - continue; - } + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; - WARN_ON_ONCE(box->cpu != old_cpu); - if (new_cpu >= 0) { - uncore_pmu_cancel_hrtimer(box); - perf_pmu_migrate_context(&pmu->pmu, - old_cpu, new_cpu); - box->cpu = new_cpu; - } else { - box->cpu = -1; - } + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; } + + WARN_ON_ONCE(box->cpu != old_cpu); + box->cpu = -1; + if (new_cpu < 0) + continue; + + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu); + box->cpu = new_cpu; } } +static void uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + for (; *uncores; uncores++) + uncore_change_type_ctx(*uncores, 
old_cpu, new_cpu); +} + static void uncore_event_exit_cpu(int cpu) { int i, phys_id, target; @@ -1318,8 +1323,8 @@ static int __init uncore_msr_pmus_register(void) struct intel_uncore_type **types = uncore_msr_uncores; int ret; - while (*types) { - ret = type_pmu_register(*types++); + for (; *types; types++) { + ret = type_pmu_register(*types); if (ret) return ret; } -- cgit From 1384c70442b71f882e7e7153df542794d468a15d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:13 +0000 Subject: perf/x86/uncore: Make uncore_pcibus_to_physid() static No users outside of this file. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.285504825@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/events/intel/uncore.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 84055f2c36f2..80ee3d4e7005 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -21,7 +21,7 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); -int uncore_pcibus_to_physid(struct pci_bus *bus) +static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; int phys_id = -1; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index cf2cb1fd85e9..f118c0d89814 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -125,7 +125,6 @@ struct pci2phy_map { int pbus_to_physid[256]; }; -int uncore_pcibus_to_physid(struct pci_bus *bus); struct pci2phy_map *__find_pci2phy_map(int segment); ssize_t uncore_event_show(struct kobject *kobj, -- cgit From 1f2569fac6c6dd5a6e3fba41c183c04273f05a58 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:14 +0000 Subject: perf/x86/intel/uncore: Store box in event->pmu_private Store the PMU pointer in event->pmu_private, so we can get rid of the per CPU data storage. We keep it after converting to per package data, because a CPU to package lookup will be 3 loads versus one and these usage sites are in the perf fast path. 
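For illustration, a minimal sketch of the two lookup styles this trade-off compares. The wrapper names below are hypothetical; uncore_event_to_pmu(), pmu->boxes[] and topology_logical_package_id() are the real helpers used later in this series, and event->pmu_private is filled in once at event init time:

/* Per-package lookup (as it looks after the later conversion):
 * roughly three dependent loads on every fast-path call.
 */
static struct intel_uncore_box *box_via_topology(struct perf_event *event)
{
	struct intel_uncore_pmu *pmu = uncore_event_to_pmu(event);

	return pmu->boxes[topology_logical_package_id(smp_processor_id())];
}

/* Cached pointer: a single load in the fast path. */
static struct intel_uncore_box *box_via_pmu_private(struct perf_event *event)
{
	return event->pmu_private;
}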
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.460851335@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 15 +-------------- arch/x86/events/intel/uncore.h | 12 ++++++++++-- arch/x86/events/intel/uncore_snb.c | 1 + 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 80ee3d4e7005..220f81ce235d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -92,11 +92,6 @@ ssize_t uncore_event_show(struct kobject *kobj, return sprintf(buf, "%s", event->config); } -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { struct intel_uncore_box *box; @@ -122,15 +117,6 @@ out: return *per_cpu_ptr(pmu->box, cpu); } -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); -} - u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { u64 count; @@ -690,6 +676,7 @@ static int uncore_pmu_event_init(struct perf_event *event) if (!box || box->cpu < 0) return -EINVAL; event->cpu = box->cpu; + event->pmu_private = box; event->hw.idx = -1; event->hw.last_tag = ~0ULL; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index f118c0d89814..479c1e8f2326 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -319,9 +319,17 @@ static inline bool uncore_box_is_fake(struct intel_uncore_box *box) return (box->phys_id < 0); } -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); +static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + return event->pmu_private; +} + struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 120e106389f6..96531d2b843f 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -313,6 +313,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) return -EINVAL; event->cpu = box->cpu; + event->pmu_private = box; event->hw.idx = -1; event->hw.last_tag = ~0ULL; -- cgit From 1f12e32f4cd5243ae46d8b933181be0d022c6793 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:15 +0000 Subject: x86/topology: Create logical package id For per package oriented services we must be able to rely on the number of 
CPU packages to be within bounds. Create a tracking facility, which - calculates the number of possible packages depending on nr_cpu_ids after boot - makes sure that the package id is within the number of possible packages. If the apic id is outside we map it to a logical package id if there is enough space available. Provide interfaces for drivers to query the mapping and do translations from physcial to logical ids. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Toshi Kani Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.541071755@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 + arch/x86/include/asm/topology.h | 11 +++++ arch/x86/kernel/apic/apic.c | 14 ++++++ arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/intel.c | 13 +++++ arch/x86/kernel/smpboot.c | 100 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 142 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 20c11d1aa4cc..813384ef811a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -129,6 +129,8 @@ struct cpuinfo_x86 { u16 booted_cores; /* Physical processor id: */ u16 phys_proc_id; + /* Logical processor id: */ + u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; /* Compute unit id */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0fb46482dfde..7f991bd5031b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { } extern const struct cpumask *cpu_coregroup_mask(int cpu); +#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) + +extern unsigned int __max_logical_packages; +#define topology_max_packages() (__max_logical_packages) +int topology_update_package_map(unsigned int apicid, unsigned int cpu); +extern int topology_phys_to_logical_pkg(unsigned int pkg); +#else +#define topology_max_packages() (1) +static inline int +topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8a5cddac7d44..531b9611c51d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2077,6 +2077,20 @@ int generic_processor_info(int apicid, int version) } else cpu = cpumask_next_zero(-1, cpu_present_mask); + /* + * This can happen on physical hotplug. The sanity check at boot time + * is done from native_smp_prepare_cpus() after num_possible_cpus() is + * established. + */ + if (topology_update_package_map(apicid, cpu) < 0) { + int thiscpu = max + disabled_cpus; + + pr_warning("ACPI: Package limit reached. 
Processor %d/0x%x ignored.\n", + thiscpu, apicid); + disabled_cpus++; + return -ENOSPC; + } + /* * Validate version */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68a80e9b67fc..81cf716f6f97 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -975,6 +975,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + /* The boot/hotplug time assigment got cleared, restore it */ + c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id); } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 05b9211ea0f7..38766c2b5b00 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f77b3c1..3bf1e0b5f827 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Logical package management. We might want to allocate that dynamically */ +static int *physical_to_logical_pkg __read_mostly; +static unsigned long *physical_package_map __read_mostly;; +static unsigned long *logical_package_map __read_mostly; +static unsigned int max_physical_pkg_id __read_mostly; +unsigned int __max_logical_packages __read_mostly; +EXPORT_SYMBOL(__max_logical_packages); + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -251,6 +259,97 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_ONLINE); } +int topology_update_package_map(unsigned int apicid, unsigned int cpu) +{ + unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + + /* Called from early boot ? 
*/ + if (!physical_package_map) + return 0; + + if (pkg >= max_physical_pkg_id) + return -EINVAL; + + /* Set the logical package id */ + if (test_and_set_bit(pkg, physical_package_map)) + goto found; + + if (pkg < __max_logical_packages) { + set_bit(pkg, logical_package_map); + physical_to_logical_pkg[pkg] = pkg; + goto found; + } + new = find_first_zero_bit(logical_package_map, __max_logical_packages); + if (new >= __max_logical_packages) { + physical_to_logical_pkg[pkg] = -1; + pr_warn("APIC(%x) Package %u exceeds logical package map\n", + apicid, pkg); + return -ENOSPC; + } + set_bit(new, logical_package_map); + pr_info("APIC(%x) Converting physical %u to logical package %u\n", + apicid, pkg, new); + physical_to_logical_pkg[pkg] = new; + +found: + cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg]; + return 0; +} + +/** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + if (phys_pkg >= max_physical_pkg_id) + return -1; + return physical_to_logical_pkg[phys_pkg]; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +static void __init smp_init_package_map(void) +{ + unsigned int ncpus, cpu; + size_t size; + + /* + * Today neither Intel nor AMD support heterogenous systems. That + * might change in the future.... + */ + ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings; + __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + + /* + * Possibly larger than what we need as the number of apic ids per + * package can be smaller than the actual used apic ids. + */ + max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus); + size = max_physical_pkg_id * sizeof(unsigned int); + physical_to_logical_pkg = kmalloc(size, GFP_KERNEL); + memset(physical_to_logical_pkg, 0xff, size); + size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); + physical_package_map = kzalloc(size, GFP_KERNEL); + size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); + logical_package_map = kzalloc(size, GFP_KERNEL); + + pr_info("Max logical packages: %u\n", __max_logical_packages); + + for_each_present_cpu(cpu) { + unsigned int apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) + continue; + if (!topology_update_package_map(apicid, cpu)) + continue; + pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); + per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; + set_cpu_possible(cpu, false); + set_cpu_present(cpu, false); + } +} + void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ @@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; + smp_init_package_map(); } /* -- cgit From cf6d445f68974d0b15a14cf6021be38a91f2b5d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:16 +0000 Subject: perf/x86/uncore: Track packages, not per CPU data Uncore is a per package facility, but the code tries to mimick a per CPU facility with completely convoluted constructs. Simplify the whole machinery by tracking per package information. While at it, avoid the kfree/alloc dance when a CPU goes offline and online again. There is no point in freeing the box after it was allocated. We just keep proper refcounting and the first CPU which comes online in a package does the initialization/activation of the box. 
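A rough sketch of the model described above (the helper name is hypothetical; pmu->boxes[], box->refcnt, topology_logical_package_id() and uncore_box_init() come from the diff below, and the real code adds the weight of all already-online package siblings in one go):

static void uncore_box_online_sketch(struct intel_uncore_pmu *pmu, int cpu)
{
	int pkg = topology_logical_package_id(cpu);
	struct intel_uncore_box *box = pmu->boxes[pkg];

	if (!box)
		return;

	/* Only the first CPU of the package initializes the hardware box;
	 * later CPUs merely take a reference.
	 */
	if (atomic_inc_return(&box->refcnt) == 1)
		uncore_box_init(box);
}

The box itself is allocated once per package in the CPU_UP_PREPARE path and stays allocated across offline/online cycles, which is what removes the old kfree/alloc dance.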
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.622258933@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 422 +++++++++++++++-------------------- arch/x86/events/intel/uncore.h | 18 +- arch/x86/events/intel/uncore_snbep.c | 10 +- 3 files changed, 198 insertions(+), 252 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 220f81ce235d..096fc0de1bcd 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -9,9 +9,9 @@ struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); -struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +struct pci_extra_dev *uncore_extra_pci_dev; +static int max_packages; -static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -94,27 +94,7 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - /* Recheck in lock to handle races. */ - if (*per_cpu_ptr(pmu->box, cpu)) - goto out; - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } -out: - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); + return pmu->boxes[topology_logical_package_id(cpu)]; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -315,9 +295,9 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, raw_spin_lock_init(&box->shared_regs[i].lock); uncore_pmu_init_hrtimer(box); - atomic_set(&box->refcnt, 1); box->cpu = -1; - box->phys_id = -1; + box->pci_phys_id = -1; + box->pkgid = -1; /* set default hrtimer timeout */ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; @@ -774,14 +754,24 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } +static void uncore_free_boxes(struct intel_uncore_pmu *pmu) +{ + int pkg; + + for (pkg = 0; pkg < max_packages; pkg++) + kfree(pmu->boxes[pkg]); + kfree(pmu->boxes); +} + static void __init uncore_type_exit(struct intel_uncore_type *type) { + struct intel_uncore_pmu *pmu = type->pmus; int i; - if (type->pmus) { - for (i = 0; i < type->num_boxes; i++) { - uncore_pmu_unregister(&type->pmus[i]); - free_percpu(type->pmus[i].box); + if (pmu) { + for (i = 0; i < type->num_boxes; i++, pmu++) { + uncore_pmu_unregister(pmu); + uncore_free_boxes(pmu); } kfree(type->pmus); type->pmus = NULL; @@ -796,37 +786,36 @@ static void __init uncore_types_exit(struct intel_uncore_type **types) uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type) +static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) { struct intel_uncore_pmu *pmus; struct attribute_group *attr_group; struct attribute **attrs; + size_t size; int 
i, j; pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); if (!pmus) return -ENOMEM; - type->pmus = pmus; - - type->unconstrainted = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, type->num_counters, 0, 0); + size = max_packages * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = -1; - pmus[i].pmu_idx = i; - pmus[i].type = type; - INIT_LIST_HEAD(&pmus[i].box_list); - pmus[i].box = alloc_percpu(struct intel_uncore_box *); - if (!pmus[i].box) + pmus[i].func_id = setid ? i : -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + pmus[i].boxes = kzalloc(size, GFP_KERNEL); + if (!pmus[i].boxes) return -ENOMEM; } + type->pmus = pmus; + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + if (type->event_descs) { - i = 0; - while (type->event_descs[i].attr.attr.name) - i++; + for (i = 0; type->event_descs[i].attr.attr.name; i++); attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); @@ -847,12 +836,13 @@ static int __init uncore_type_init(struct intel_uncore_type *type) return 0; } -static int __init uncore_types_init(struct intel_uncore_type **types) +static int __init +uncore_types_init(struct intel_uncore_type **types, bool setid) { - int i, ret; + int ret; - for (i = 0; types[i]; i++) { - ret = uncore_type_init(types[i]); + for (; *types; types++) { + ret = uncore_type_init(*types, setid); if (ret) return ret; } @@ -864,28 +854,28 @@ static int __init uncore_types_init(struct intel_uncore_type **types) */ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - struct intel_uncore_type *type; - bool first_box = false; - int phys_id, ret; + int phys_id, pkg, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0 || phys_id >= UNCORE_SOCKET_MAX) + if (phys_id < 0) return -ENODEV; + pkg = topology_phys_to_logical_pkg(phys_id); + if (WARN_ON_ONCE(pkg < 0)) + return -EINVAL; + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[phys_id][idx] = pdev; + + uncore_extra_pci_dev[pkg].dev[idx] = pdev; pci_set_drvdata(pdev, NULL); return 0; } type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - /* * for performance monitoring unit with multiple boxes, * each box has a different function id. 
@@ -902,32 +892,35 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu++; } + if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL)) + return -EINVAL; + + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else WARN_ON_ONCE(pmu->func_id != pdev->devfn); - box->phys_id = phys_id; + atomic_inc(&box->refcnt); + box->pci_phys_id = phys_id; + box->pkgid = pkg; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); pci_set_drvdata(pdev, box); - raw_spin_lock(&uncore_box_lock); - if (list_empty(&pmu->box_list)) - first_box = true; - list_add_tail(&box->list, &pmu->box_list); - raw_spin_unlock(&uncore_box_lock); - - if (!first_box) + pmu->boxes[pkg] = box; + if (atomic_inc_return(&pmu->activeboxes) > 1) return 0; + /* First active box registers the pmu */ ret = uncore_pmu_register(pmu); if (ret) { pci_set_drvdata(pdev, NULL); - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - raw_spin_unlock(&uncore_box_lock); + pmu->boxes[pkg] = NULL; uncore_box_exit(box); kfree(box); } @@ -938,15 +931,16 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id; - bool last_box = false; + int i, phys_id, pkg; phys_id = uncore_pcibus_to_physid(pdev->bus); + pkg = topology_phys_to_logical_pkg(phys_id); + box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[phys_id][i] == pdev) { - uncore_extra_pci_dev[phys_id][i] = NULL; + if (uncore_extra_pci_dev[pkg].dev[i] == pdev) { + uncore_extra_pci_dev[pkg].dev[i] = NULL; break; } } @@ -955,34 +949,20 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->phys_id)) + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) return; pci_set_drvdata(pdev, NULL); - - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - if (list_empty(&pmu->box_list)) - last_box = true; - raw_spin_unlock(&uncore_box_lock); - - for_each_possible_cpu(cpu) { - if (*per_cpu_ptr(pmu->box, cpu) == box) { - *per_cpu_ptr(pmu->box, cpu) = NULL; - atomic_dec(&box->refcnt); - } - } - - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + pmu->boxes[pkg] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); uncore_box_exit(box); kfree(box); - - if (last_box) - uncore_pmu_unregister(pmu); } static int __init uncore_pci_init(void) { + size_t size; int ret; switch (boot_cpu_data.x86_model) { @@ -1025,24 +1005,34 @@ static int __init uncore_pci_init(void) if (ret) return ret; - ret = uncore_types_init(uncore_pci_uncores); - if (ret) + size = max_packages * sizeof(struct pci_extra_dev); + uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); + if (!uncore_extra_pci_dev) { + ret = -ENOMEM; goto err; + } + + ret = uncore_types_init(uncore_pci_uncores, false); + if (ret) + goto errtype; uncore_pci_driver->probe = uncore_pci_probe; uncore_pci_driver->remove = uncore_pci_remove; ret = pci_register_driver(uncore_pci_driver); if (ret) - goto err; + goto errtype; pcidrv_registered = true; return 0; -err: +errtype: uncore_types_exit(uncore_pci_uncores); - uncore_pci_uncores = empty_uncore; + kfree(uncore_extra_pci_dev); + uncore_extra_pci_dev = NULL; uncore_free_pcibus_map(); +err: + uncore_pci_uncores = empty_uncore; return ret; } @@ -1052,110 +1042,81 @@ static void __init uncore_pci_exit(void) pcidrv_registered = false; pci_unregister_driver(uncore_pci_driver); 
uncore_types_exit(uncore_pci_uncores); + kfree(uncore_extra_pci_dev); uncore_free_pcibus_map(); } } -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ -static LIST_HEAD(boxes_to_free); - -static void uncore_kfree_boxes(void) -{ - struct intel_uncore_box *box; - - while (!list_empty(&boxes_to_free)) { - box = list_entry(boxes_to_free.next, - struct intel_uncore_box, list); - list_del(&box->list); - kfree(box); - } -} - static void uncore_cpu_dying(int cpu) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, j; + int i, pkg; - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) { - list_add(&box->list, &boxes_to_free); + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) uncore_box_exit(box); - } } } } -static int uncore_cpu_starting(int cpu) +static void uncore_cpu_starting(int cpu, bool init) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box, *exist; - int i, j, k, phys_id; - - phys_id = topology_physical_package_id(cpu); - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); - continue; - } + struct intel_uncore_box *box; + int i, pkg, ncpus = 1; - for_each_online_cpu(k) { - exist = *per_cpu_ptr(pmu->box, k); - if (exist && exist->phys_id == phys_id) { - atomic_inc(&exist->refcnt); - *per_cpu_ptr(pmu->box, cpu) = exist; - if (box) { - list_add(&box->list, - &boxes_to_free); - box = NULL; - } - break; - } - } + if (init) { + /* + * On init we get the number of online cpus in the package + * and set refcount for all of them. 
+ */ + ncpus = cpumask_weight(topology_core_cpumask(cpu)); + } - if (box) { - box->phys_id = phys_id; + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box) + continue; + /* The first cpu on a package activates the box */ + if (atomic_add_return(ncpus, &box->refcnt) == ncpus) uncore_box_init(box); - } } } - return 0; } -static int uncore_cpu_prepare(int cpu, int phys_id) +static int uncore_cpu_prepare(int cpu) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (pmu->func_id < 0) - pmu->func_id = j; + int i, pkg; + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + /* First cpu of a package allocates the box */ box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) return -ENOMEM; - box->pmu = pmu; - box->phys_id = phys_id; - *per_cpu_ptr(pmu->box, cpu) = box; + box->pkgid = pkg; + pmu->boxes[pkg] = box; } } return 0; @@ -1166,13 +1127,11 @@ static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i; + int i, pkg; + pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu); for (i = 0; i < type->num_boxes; i++, pmu++) { - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); + box = pmu->boxes[pkg]; if (!box) continue; @@ -1202,27 +1161,20 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static void uncore_event_exit_cpu(int cpu) { - int i, phys_id, target; + int target; - /* if exiting cpu is used for collecting uncore events */ + /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) return; - /* find a new cpu to collect uncore events */ - phys_id = topology_physical_package_id(cpu); - target = -1; - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } + /* Find a new cpu to collect uncore events */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - /* migrate uncore events to the new cpu */ - if (target >= 0) + /* Migrate uncore events to the new target */ + if (target < nr_cpu_ids) cpumask_set_cpu(target, &uncore_cpu_mask); + else + target = -1; uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); @@ -1230,13 +1182,15 @@ static void uncore_event_exit_cpu(int cpu) static void uncore_event_init_cpu(int cpu) { - int i, phys_id; + int target; - phys_id = topology_physical_package_id(cpu); - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } + /* + * Check if there is an online cpu in the package + * which collects uncore events already. 
+ */ + target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu)); + if (target < nr_cpu_ids) + return; cpumask_set_cpu(cpu, &uncore_cpu_mask); @@ -1249,38 +1203,25 @@ static int uncore_cpu_notifier(struct notifier_block *self, { unsigned int cpu = (long)hcpu; - /* allocate/free data structure for uncore box */ switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - return notifier_from_errno(uncore_cpu_prepare(cpu, -1)); + return notifier_from_errno(uncore_cpu_prepare(cpu)); + case CPU_STARTING: - uncore_cpu_starting(cpu); + uncore_cpu_starting(cpu, false); + case CPU_DOWN_FAILED: + uncore_event_init_cpu(cpu); break; + case CPU_UP_CANCELED: case CPU_DYING: uncore_cpu_dying(cpu); break; - case CPU_ONLINE: - case CPU_DEAD: - uncore_kfree_boxes(); - break; - default: - break; - } - /* select the cpu that collects uncore events */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - uncore_event_init_cpu(cpu); - break; case CPU_DOWN_PREPARE: uncore_event_exit_cpu(cpu); break; - default: - break; } - return NOTIFY_OK; } @@ -1362,7 +1303,7 @@ static int __init uncore_cpu_init(void) return 0; } - ret = uncore_types_init(uncore_msr_uncores); + ret = uncore_types_init(uncore_msr_uncores, true); if (ret) goto err; @@ -1378,39 +1319,34 @@ err: static void __init uncore_cpu_setup(void *dummy) { - uncore_cpu_starting(smp_processor_id()); + uncore_cpu_starting(smp_processor_id(), true); } +/* Lazy to avoid allocation of a few bytes for the normal case */ +static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC); + static int __init uncore_cpumask_init(void) { - int cpu, ret = 0; - - cpu_notifier_register_begin(); + unsigned int cpu; for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); + unsigned int pkg = topology_logical_package_id(cpu); + int ret; - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) + if (test_and_set_bit(pkg, packages)) continue; - - ret = uncore_cpu_prepare(cpu, phys_id); + /* + * The first online cpu of each package takes the refcounts + * for all other online cpus in that package. 
+ */ + ret = uncore_cpu_prepare(cpu); if (ret) - goto out; + return ret; uncore_event_init_cpu(cpu); + smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1); } - on_each_cpu(uncore_cpu_setup, NULL, 1); - __register_cpu_notifier(&uncore_cpu_nb); - -out: - cpu_notifier_register_done(); - return ret; + return 0; } static int __init intel_uncore_init(void) @@ -1423,22 +1359,26 @@ static int __init intel_uncore_init(void) if (cpu_has_hypervisor) return -ENODEV; + max_packages = topology_max_packages(); + ret = uncore_pci_init(); if (ret) return ret; ret = uncore_cpu_init(); if (ret) - goto errpci; + goto err; + + cpu_notifier_register_begin(); ret = uncore_cpumask_init(); if (ret) - goto errcpu; - + goto err; + cpu_notifier_register_done(); return 0; -errcpu: +err: uncore_types_exit(uncore_msr_uncores); -errpci: uncore_pci_exit(); + cpu_notifier_register_done(); return ret; } device_initcall(intel_uncore_init); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 479c1e8f2326..01c67935bb28 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -19,11 +19,12 @@ #define UNCORE_EXTRA_PCI_DEV 0xff #define UNCORE_EXTRA_PCI_DEV_MAX 3 -/* support up to 8 sockets */ -#define UNCORE_SOCKET_MAX 8 - #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) +struct pci_extra_dev { + struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX]; +}; + struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; @@ -79,9 +80,9 @@ struct intel_uncore_pmu { int pmu_idx; int func_id; bool registered; + atomic_t activeboxes; struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; + struct intel_uncore_box **boxes; }; struct intel_uncore_extra_reg { @@ -91,7 +92,8 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int phys_id; + int pci_phys_id; + int pkgid; int n_active; /* number of active events */ int n_events; int cpu; /* cpu to collect events */ @@ -316,7 +318,7 @@ static inline void uncore_box_exit(struct intel_uncore_box *box) static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { - return (box->phys_id < 0); + return (box->pkgid < 0); } static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) @@ -345,7 +347,7 @@ extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; -extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; /* perf_event_intel_uncore_snb.c */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index d967fcc9d5fb..93f6bd9bf761 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -986,7 +986,9 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve if (reg1->idx != EXTRA_REG_NONE) { int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; + int pkg = topology_phys_to_logical_pkg(box->pci_phys_id); + struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx]; + if (filter_pdev) { pci_write_config_dword(filter_pdev, reg1->reg, (u32)reg1->config); @@ -2520,14 +2522,16 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { + int pkg = 
topology_phys_to_logical_pkg(0); + if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { u32 capid4; - pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], 0x94, &capid4); if (((capid4 >> 6) & 0x3) == 0) hswep_uncore_sbox.num_boxes = 2; -- cgit From 7b672d6433f0f31c95be74b649090cd8b3a6388d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:17 +0000 Subject: perf/x86/intel/uncore: Clear all hardware state on exit The only missing bit is to completely clear the hardware state on failure exit. This is now a pretty simple exercise. Undo the box->init_box() setup on all packages which have been initialized so far. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.702452407@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 096fc0de1bcd..5ad8e52cbc36 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -754,6 +754,30 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } +static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +{ + struct intel_uncore_pmu *pmu = type->pmus; + struct intel_uncore_box *box; + int i, pkg; + + if (pmu) { + pkg = topology_physical_package_id(cpu); + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box) + uncore_box_exit(box); + } + } +} + +static void __init uncore_exit_boxes(void *dummy) +{ + struct intel_uncore_type **types; + + for (types = uncore_msr_uncores; *types; types++) + __uncore_exit_boxes(*types++, smp_processor_id()); +} + static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int pkg; @@ -1376,6 +1400,8 @@ static int __init intel_uncore_init(void) return 0; err: + /* Undo box->init_box() */ + on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); cpu_notifier_register_done(); -- cgit From 5485592c17954b2a91fbd12980af596dc666079d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:17 +0000 Subject: perf/x86/intel/uncore: Make PCI and MSR uncore independent Andi wanted to do this before, but the patch fell down the cracks. Implement it with the proper error handling. 
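The resulting init policy boils down to the following sketch (hypothetical condensation; uncore_pci_init() and uncore_cpu_init() are the real functions changed in the diff below to return -ENODEV instead of silently returning 0):

static int __init intel_uncore_init_sketch(void)
{
	int pret = uncore_pci_init();	/* -ENODEV if no PCI uncore support */
	int cret = uncore_cpu_init();	/* -ENODEV if no MSR uncore support */

	/* Bail out only if neither flavour is usable. */
	if (cret && pret)
		return -ENODEV;

	return 0;
}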
Requested-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.799159968@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 5ad8e52cbc36..7012d18bb293 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1023,7 +1023,7 @@ static int __init uncore_pci_init(void) ret = skl_uncore_pci_init(); break; default: - return 0; + return -ENODEV; } if (ret) @@ -1324,7 +1324,7 @@ static int __init uncore_cpu_init(void) knl_uncore_cpu_init(); break; default: - return 0; + return -ENODEV; } ret = uncore_types_init(uncore_msr_uncores, true); @@ -1349,7 +1349,7 @@ static void __init uncore_cpu_setup(void *dummy) /* Lazy to avoid allocation of a few bytes for the normal case */ static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC); -static int __init uncore_cpumask_init(void) +static int __init uncore_cpumask_init(bool msr) { unsigned int cpu; @@ -1360,12 +1360,15 @@ static int __init uncore_cpumask_init(void) if (test_and_set_bit(pkg, packages)) continue; /* - * The first online cpu of each package takes the refcounts - * for all other online cpus in that package. + * The first online cpu of each package allocates and takes + * the refcounts for all other online cpus in that package. + * If msrs are not enabled no allocation is required. */ - ret = uncore_cpu_prepare(cpu); - if (ret) - return ret; + if (msr) { + ret = uncore_cpu_prepare(cpu); + if (ret) + return ret; + } uncore_event_init_cpu(cpu); smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1); } @@ -1375,7 +1378,7 @@ static int __init uncore_cpumask_init(void) static int __init intel_uncore_init(void) { - int ret; + int pret, cret, ret; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; @@ -1385,15 +1388,14 @@ static int __init intel_uncore_init(void) max_packages = topology_max_packages(); - ret = uncore_pci_init(); - if (ret) - return ret; - ret = uncore_cpu_init(); - if (ret) - goto err; + pret = uncore_pci_init(); + cret = uncore_cpu_init(); + + if (cret && pret) + return -ENODEV; cpu_notifier_register_begin(); - ret = uncore_cpumask_init(); + ret = uncore_cpumask_init(!cret); if (ret) goto err; cpu_notifier_register_done(); -- cgit From 827db839cd051cfde5a618fb4427e86dc91bc9aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:20 +0000 Subject: perf/x86/intel/cqm: Get rid of the silly for_each_cpu() lookups CQM is a strict per package facility. Use the proper cpumasks to lookup the readers. 
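Condensed sketch of that pattern (function names are hypothetical; cqm_cpumask, topology_core_cpumask(), cpumask_any_and() and cpumask_any_but() are what the diff below actually uses):

static void cqm_reader_online_sketch(unsigned int cpu)
{
	/* Does this package already have a designated reader? */
	if (cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)) >= nr_cpu_ids)
		cpumask_set_cpu(cpu, &cqm_cpumask);	/* no: @cpu becomes it */
}

static void cqm_reader_offline_sketch(unsigned int cpu)
{
	int target;

	/* Nothing to do unless @cpu was the reader for its package. */
	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
		return;

	/* Hand the reader role to any remaining online package sibling. */
	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
	if (target < nr_cpu_ids)
		cpumask_set_cpu(target, &cqm_cpumask);
}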
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.054916179@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 1b064c430140..93cb412a5579 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -1244,15 +1244,12 @@ static struct pmu intel_cqm_pmu = { static inline void cqm_pick_event_reader(int cpu) { - int phys_id = topology_physical_package_id(cpu); - int i; + int reader; - for_each_cpu(i, &cqm_cpumask) { - if (phys_id == topology_physical_package_id(i)) - return; /* already got reader for this socket */ - } - - cpumask_set_cpu(cpu, &cqm_cpumask); + /* First online cpu in package becomes the reader */ + reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); + if (reader >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cqm_cpumask); } static void intel_cqm_cpu_starting(unsigned int cpu) @@ -1270,24 +1267,17 @@ static void intel_cqm_cpu_starting(unsigned int cpu) static void intel_cqm_cpu_exit(unsigned int cpu) { - int phys_id = topology_physical_package_id(cpu); - int i; + int target; - /* - * Is @cpu a designated cqm reader? - */ + /* Is @cpu the current cqm reader for this package ? */ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) return; - for_each_online_cpu(i) { - if (i == cpu) - continue; + /* Find another online reader in this package */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (phys_id == topology_physical_package_id(i)) { - cpumask_set_cpu(i, &cqm_cpumask); - break; - } - } + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &cqm_cpumask); } static int intel_cqm_cpu_notifier(struct notifier_block *nb, -- cgit From 4d120c535d638a952e644817ba3cbef30deedb2b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:20 +0000 Subject: perf/x86/intel/rapl: Make Knights Landings support functional The Knights Landings support added the events and the detection case, but then returns 0 without actually initializing the driver. 
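Reduced to its essence, the bug is a missing break in the CPU model switch (sketch only; the identifiers match the diff below, the other supported models are elided):

	switch (boot_cpu_data.x86_model) {
	/* ... other supported models ... */
	case 87:	/* Knights Landing */
		rapl_cntr_mask = RAPL_IDX_KNL;
		rapl_pmu_events_group.attrs = rapl_events_knl_attr;
		break;	/* <-- the one-line fix */
	default:	/* unsupported */
		return 0;
	}

Without the break, the Knights Landing case falls straight into the "unsupported" default and rapl_pmu_init() returns before registering anything.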
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dasaratharaman Chandramouli Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Fixes: 3a2a7797326a4 "perf/x86/intel/rapl: Add support for Knights Landing (KNL)" Link: http://lkml.kernel.org/r/20160222221012.149331888@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 580f504ec27c..536f0cebba8f 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -731,7 +731,7 @@ static int __init rapl_pmu_init(void) rapl_add_quirk(rapl_hsw_server_quirk); rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; - + break; default: /* unsupported */ return 0; -- cgit From 55f2890f0726fe4a1f41a3a0e72ca1a263f095c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:21 +0000 Subject: perf/x86/intel/rapl: Add proper error handling Like uncore the rapl driver lacks error handling. It leaks memory and leaves the hotplug notifier registered. Add the proper error checks, cleanup the memory and register the hotplug notifier only on success. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.231222076@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 536f0cebba8f..98b04d202760 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -686,6 +686,14 @@ static int rapl_check_hw_unit(void) return 0; } +static void __init cleanup_rapl_pmus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + kfree(per_cpu(rapl_pmu, cpu)); +} + static const struct x86_cpu_id rapl_cpu_match[] = { [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, [1] = {}, @@ -702,7 +710,7 @@ static int __init rapl_pmu_init(void) * check for Intel processor family 6 */ if (!x86_match_cpu(rapl_cpu_match)) - return 0; + return -ENODEV; /* check supported CPU */ switch (boot_cpu_data.x86_model) { @@ -734,8 +742,9 @@ static int __init rapl_pmu_init(void) break; default: /* unsupported */ - return 0; + return -ENODEV; } + ret = rapl_check_hw_unit(); if (ret) return ret; @@ -743,6 +752,7 @@ static int __init rapl_pmu_init(void) /* run cpu model quirks */ for (quirk = rapl_quirks; quirk; quirk = quirk->next) quirk->func(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { @@ -752,15 +762,14 @@ static int __init rapl_pmu_init(void) rapl_cpu_init(cpu); } - __perf_cpu_notifier(rapl_cpu_notifier); - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (WARN_ON(ret)) { pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - cpu_notifier_register_done(); - return -1; + goto out; } + __perf_cpu_notifier(rapl_cpu_notifier); + pmu = __this_cpu_read(rapl_pmu); pr_info("RAPL PMU detected," @@ -775,9 +784,13 @@ 
static int __init rapl_pmu_init(void) rapl_domain_names[i], rapl_hw_unit[i]); } } -out: - cpu_notifier_register_done(); + cpu_notifier_register_done(); return 0; + +out: + cleanup_rapl_pmus(); + cpu_notifier_register_done(); + return ret; } device_initcall(rapl_pmu_init); -- cgit From b8b3319a471b2df35ae0a8fe94223638468e9ca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:22 +0000 Subject: perf/x86/intel/rapl: Sanitize the quirk handling There is no point in having a quirk machinery for a single possible function. Get rid of it and move the quirk to a place where it actually makes sense. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.311639465@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 49 +++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 98b04d202760..976558a5f5d4 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -133,7 +133,6 @@ static int rapl_cntr_mask; static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); -static struct x86_pmu_quirk *rapl_quirks; static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; @@ -141,15 +140,6 @@ static inline u64 rapl_read_counter(struct perf_event *event) return raw; } -#define rapl_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = rapl_quirks; \ - rapl_quirks = &__quirk; \ -} while (0) - static inline u64 rapl_scale(u64 v, int cfg) { if (cfg > NR_RAPL_DOMAINS) { @@ -564,17 +554,6 @@ static void rapl_cpu_init(int cpu) cpumask_set_cpu(cpu, &rapl_cpu_mask); } -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); @@ -672,7 +651,18 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } -static int rapl_check_hw_unit(void) +static __init void rapl_hsw_server_quirk(void) +{ + /* + * DRAM domain on HSW server has fixed energy unit which can be + * different than the unit from power unit MSR. + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. 
Datasheet, September 2014, Reference Number: 330784-001 " + */ + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; +} + +static int rapl_check_hw_unit(void (*quirk)(void)) { u64 msr_rapl_power_unit_bits; int i; @@ -683,6 +673,9 @@ static int rapl_check_hw_unit(void) for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + /* Apply cpu model quirk */ + if (quirk) + quirk(); return 0; } @@ -701,9 +694,9 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { + void (*quirk)(void) = NULL; struct rapl_pmu *pmu; int cpu, ret; - struct x86_pmu_quirk *quirk; int i; /* @@ -720,7 +713,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ - rapl_add_quirk(rapl_hsw_server_quirk); + quirk = rapl_hsw_server_quirk; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; @@ -736,7 +729,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; case 87: /* Knights Landing */ - rapl_add_quirk(rapl_hsw_server_quirk); + quirk = rapl_hsw_server_quirk; rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; break; @@ -745,14 +738,10 @@ static int __init rapl_pmu_init(void) return -ENODEV; } - ret = rapl_check_hw_unit(); + ret = rapl_check_hw_unit(quirk); if (ret) return ret; - /* run cpu model quirks */ - for (quirk = rapl_quirks; quirk; quirk = quirk->next) - quirk->func(); - cpu_notifier_register_begin(); for_each_online_cpu(cpu) { -- cgit From 75c7003fbf419f05215d4ca09bf3964d0b1c99e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:22 +0000 Subject: perf/x86/intel/rapl: Calculate timing once No point in doing the same calculation over and over. Do it once in rapl_check_hw_unit(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.409238136@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 976558a5f5d4..b1d4a2f1feeb 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -129,6 +129,7 @@ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ static struct pmu rapl_pmu_class; static cpumask_t rapl_cpu_mask; static int rapl_cntr_mask; +static u64 rapl_timer_ms; static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); @@ -558,7 +559,6 @@ static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); - u64 ms; if (pmu) return 0; @@ -575,19 +575,7 @@ static int rapl_cpu_prepare(int cpu) pmu->pmu = &rapl_pmu_class; - /* - * use reference of 200W for scaling the timeout - * to avoid missing counter overflows. 
- * 200W = 200 Joules/sec - * divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - if (rapl_hw_unit[0] < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); - else - ms = 2; - - pmu->timer_interval = ms_to_ktime(ms); + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); rapl_hrtimer_init(pmu); @@ -676,6 +664,19 @@ static int rapl_check_hw_unit(void (*quirk)(void)) /* Apply cpu model quirk */ if (quirk) quirk(); + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. 200W = 200 Joules/sec + * Divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; + if (rapl_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); + rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); + } return 0; } @@ -695,9 +696,7 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { void (*quirk)(void) = NULL; - struct rapl_pmu *pmu; - int cpu, ret; - int i; + int cpu, ret, i; /* * check for Intel processor family 6 @@ -758,15 +757,14 @@ static int __init rapl_pmu_init(void) } __perf_cpu_notifier(rapl_cpu_notifier); - - pmu = __this_cpu_read(rapl_pmu); + cpu_notifier_register_done(); pr_info("RAPL PMU detected," " API unit is 2^-32 Joules," " %d fixed counters" " %llu ms ovfl timer\n", hweight32(rapl_cntr_mask), - ktime_to_ms(pmu->timer_interval)); + rapl_timer_ms); for (i = 0; i < NR_RAPL_DOMAINS; i++) { if (rapl_cntr_mask & (1 << i)) { pr_info("hw unit of domain %s 2^-%d Joules\n", @@ -774,7 +772,6 @@ static int __init rapl_pmu_init(void) } } - cpu_notifier_register_done(); return 0; out: -- cgit From 512089d98457b7913d2e4762a44af52fbcd87470 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:23 +0000 Subject: perf/x86/intel/rapl: Clean up the printk output The output is inconsistent. Use a proper pr_fmt prefix and split out the advertisement into a seperate function. Remove the WARN_ON() in the failure case. It's pointless as we already know where it failed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.504551295@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index b1d4a2f1feeb..f31e4b417adf 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -44,6 +44,9 @@ * the duration of the measurement. 
Tools may use a function such as * ldexp(raw_count, -32); */ + +#define pr_fmt(fmt) "RAPL PMU: " fmt + #include #include #include @@ -144,7 +147,7 @@ static inline u64 rapl_read_counter(struct perf_event *event) static inline u64 rapl_scale(u64 v, int cfg) { if (cfg > NR_RAPL_DOMAINS) { - pr_warn("invalid domain %d, failed to scale data\n", cfg); + pr_warn("Invalid domain %d, failed to scale data\n", cfg); return v; } /* @@ -680,6 +683,21 @@ static int rapl_check_hw_unit(void (*quirk)(void)) return 0; } +static void __init rapl_advertise(void) +{ + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", + hweight32(rapl_cntr_mask), rapl_timer_ms); + + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } +} + static void __init cleanup_rapl_pmus(void) { int cpu; @@ -696,7 +714,7 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { void (*quirk)(void) = NULL; - int cpu, ret, i; + int cpu, ret; /* * check for Intel processor family 6 @@ -751,30 +769,16 @@ static int __init rapl_pmu_init(void) } ret = perf_pmu_register(&rapl_pmu_class, "power", -1); - if (WARN_ON(ret)) { - pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + if (ret) goto out; - } __perf_cpu_notifier(rapl_cpu_notifier); cpu_notifier_register_done(); - - pr_info("RAPL PMU detected," - " API unit is 2^-32 Joules," - " %d fixed counters" - " %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), - rapl_timer_ms); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } - + rapl_advertise(); return 0; out: + pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); cpu_notifier_register_done(); return ret; -- cgit From 7162b8fea63061b6231bc5e8a0fed55167e71b4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:24 +0000 Subject: perf/x86/intel/rapl: Refactor the code some more Split out code from init into seperate functions. Tidy up the code and get rid of pointless comments. I wish there would be comments for code which is not obvious.... 
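One piece of non-obvious code in this series is the rapl_timer_ms formula that the earlier "Calculate timing once" patch moved into rapl_check_hw_unit(). The standalone snippet below is an illustration only, not part of any patch; it assumes an energy unit of 2^-16 Joules purely for the arithmetic, and checks that the formula arms the overflow timer at roughly half the worst-case counter wrap time at the 200 W reference load.

#include <stdio.h>

int main(void)
{
	unsigned int hw_unit = 16;	/* assumed: counter counts in 2^-16 J units */
	unsigned long long wrap_joules = 1ULL << (32 - hw_unit);	/* 65536 J */
	unsigned long long wrap_secs = wrap_joules / 200;		/* ~327 s at 200 W */
	unsigned long long timer_ms = (1000 / (2 * 100)) *
				      (1ULL << (32 - hw_unit - 1));	/* 5 * 32768 = 163840 ms */

	printf("counter wraps after ~%llu s, timer fires every %llu ms (~%llu s)\n",
	       wrap_secs, timer_ms, timer_ms / 1000);
	return 0;
}

So the timer fires about twice per worst-case wrap period, which is what the "divide interval by 2 to avoid lockstep" comment refers to.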
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.588544679@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 61 ++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index f31e4b417adf..ba5043b12c3e 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -110,7 +110,7 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ +#define RAPL_CNTR_WIDTH 32 #define RAPL_EVENT_ATTR_STR(_name, v, str) \ static struct perf_pmu_events_attr event_attr_##v = { \ @@ -120,15 +120,16 @@ static struct perf_pmu_events_attr event_attr_##v = { \ }; struct rapl_pmu { - spinlock_t lock; - int n_active; /* number of active events */ - struct list_head active_list; - struct pmu *pmu; /* pointer to rapl_pmu_class */ - ktime_t timer_interval; /* in ktime_t unit */ - struct hrtimer hrtimer; + spinlock_t lock; + int n_active; + struct list_head active_list; + struct pmu *pmu; + ktime_t timer_interval; + struct hrtimer hrtimer; }; -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ + /* 1/2^hw_unit Joule */ +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; static struct pmu rapl_pmu_class; static cpumask_t rapl_cpu_mask; static int rapl_cntr_mask; @@ -200,11 +201,6 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) HRTIMER_MODE_REL_PINNED); } -static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_cancel(&pmu->hrtimer); -} - static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); @@ -216,9 +212,8 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) spin_lock_irqsave(&pmu->lock, flags); - list_for_each_entry(event, &pmu->active_list, active_entry) { + list_for_each_entry(event, &pmu->active_list, active_entry) rapl_event_update(event); - } spin_unlock_irqrestore(&pmu->lock, flags); @@ -275,7 +270,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) WARN_ON_ONCE(pmu->n_active <= 0); pmu->n_active--; if (pmu->n_active == 0) - rapl_stop_hrtimer(pmu); + hrtimer_cancel(&pmu->hrtimer); list_del(&event->active_entry); @@ -542,7 +537,7 @@ static void rapl_cpu_exit(int cpu) perf_pmu_migrate_context(pmu->pmu, cpu, target); /* cancel overflow polling timer for CPU */ - rapl_stop_hrtimer(pmu); + hrtimer_cancel(&pmu->hrtimer); } static void rapl_cpu_init(int cpu) @@ -698,6 +693,20 @@ static void __init rapl_advertise(void) } } +static int __init rapl_prepare_cpus(void) +{ + unsigned int cpu; + int ret; + + for_each_online_cpu(cpu) { + ret = rapl_cpu_prepare(cpu); + if (ret) + return ret; + rapl_cpu_init(cpu); + } + return 0; +} + static void __init cleanup_rapl_pmus(void) { int cpu; @@ -706,7 +715,7 @@ static void __init cleanup_rapl_pmus(void) kfree(per_cpu(rapl_pmu, cpu)); } -static const struct x86_cpu_id rapl_cpu_match[] = { +static const struct x86_cpu_id rapl_cpu_match[] __initconst = { [0] = { .vendor = X86_VENDOR_INTEL, 
.family = 6 }, [1] = {}, }; @@ -714,15 +723,11 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { void (*quirk)(void) = NULL; - int cpu, ret; + int ret; - /* - * check for Intel processor family 6 - */ if (!x86_match_cpu(rapl_cpu_match)) return -ENODEV; - /* check supported CPU */ switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ @@ -751,7 +756,6 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_knl_attr; break; default: - /* unsupported */ return -ENODEV; } @@ -761,12 +765,9 @@ static int __init rapl_pmu_init(void) cpu_notifier_register_begin(); - for_each_online_cpu(cpu) { - ret = rapl_cpu_prepare(cpu); - if (ret) - goto out; - rapl_cpu_init(cpu); - } + ret = rapl_prepare_cpus(); + if (ret) + goto out; ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (ret) -- cgit From a208749c642618b6c106874153906b53f8e2ded9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:25 +0000 Subject: perf/x86/intel/rapl: Make PMU lock raw This lock is taken in hard interrupt context even on Preempt-RT. Make it raw so RT does not have to patch it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.669411833@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index ba5043b12c3e..29349990e11c 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -120,7 +120,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ }; struct rapl_pmu { - spinlock_t lock; + raw_spinlock_t lock; int n_active; struct list_head active_list; struct pmu *pmu; @@ -210,12 +210,12 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) if (!pmu->n_active) return HRTIMER_NORESTART; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); list_for_each_entry(event, &pmu->active_list, active_entry) rapl_event_update(event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); hrtimer_forward_now(hrtimer, pmu->timer_interval); @@ -252,9 +252,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode) struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); } static void rapl_pmu_event_stop(struct perf_event *event, int mode) @@ -263,7 +263,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) struct hw_perf_event *hwc = &event->hw; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); /* mark event as deactivated and stopped */ if (!(hwc->state & PERF_HES_STOPPED)) { @@ -288,7 +288,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) hwc->state |= PERF_HES_UPTODATE; } - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); } static int 
rapl_pmu_event_add(struct perf_event *event, int mode) @@ -297,14 +297,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode) struct hw_perf_event *hwc = &event->hw; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (mode & PERF_EF_START) __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); return 0; } @@ -567,7 +567,7 @@ static int rapl_cpu_prepare(int cpu) pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; - spin_lock_init(&pmu->lock); + raw_spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); -- cgit From 8a6d2f8f73caa8b8eb596a9e2d2e0b15d64751a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:25 +0000 Subject: perf/x86/intel/rapl: Utilize event->pmu_private Store the PMU pointer in event->pmu_private and use it instead of the per CPU data. Preparatory step to get rid of the per CPU allocations. The usage sites are the perf fast path, so we keep that even after the conversion to per package storage as a CPU to package lookup involves 3 loads versus 1 with the pmu_private pointer. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.748151799@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 29349990e11c..753d90cbcce2 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -122,6 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ struct rapl_pmu { raw_spinlock_t lock; int n_active; + int cpu; struct list_head active_list; struct pmu *pmu; ktime_t timer_interval; @@ -203,7 +204,7 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct perf_event *event; unsigned long flags; @@ -249,7 +250,7 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, static void rapl_pmu_event_start(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; unsigned long flags; raw_spin_lock_irqsave(&pmu->lock, flags); @@ -259,7 +260,7 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode) static void rapl_pmu_event_stop(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; @@ -293,7 +294,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) static int rapl_pmu_event_add(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; @@ -316,6 +317,7 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int 
rapl_pmu_event_init(struct perf_event *event) { + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, msr, ret = 0; @@ -327,6 +329,9 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->attr.config & ~RAPL_EVENT_MASK) return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + /* * check event is known (determines counter) */ @@ -365,6 +370,8 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ + event->cpu = pmu->cpu; + event->pmu_private = pmu; event->hw.event_base = msr; event->hw.config = cfg; event->hw.idx = bit; @@ -572,6 +579,7 @@ static int rapl_cpu_prepare(int cpu) INIT_LIST_HEAD(&pmu->active_list); pmu->pmu = &rapl_pmu_class; + pmu->cpu = cpu; pmu->timer_interval = ms_to_ktime(rapl_timer_ms); -- cgit From 9de8d686955b0e8e27847ed4edbbcd280f3fd853 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:26 +0000 Subject: perf/x86/intel/rapl: Convert it to a per package facility RAPL is a per package facility and we already have a mechanism for a dedicated per package reader. So there is no point to have multiple CPUs doing the same. The current implementation actually starts two timers on two CPUs if one does: perf stat -C1,2 -e -e power/energy-pkg .... which makes the whole concept of 1 reader per package moot. What's worse is that the above returns the double of the actual energy consumption, but that's a different problem to address and cannot be solved by removing the pointless per cpuness of that mechanism. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.845369524@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 194 +++++++++++++++++++------------------------ 1 file changed, 86 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 753d90cbcce2..019e541fa988 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -129,15 +129,23 @@ struct rapl_pmu { struct hrtimer hrtimer; }; +struct rapl_pmus { + struct pmu pmu; + unsigned int maxpkg; + struct rapl_pmu *pmus[]; +}; + /* 1/2^hw_unit Joule */ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; -static struct pmu rapl_pmu_class; +static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; -static int rapl_cntr_mask; +static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); +static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +{ + return rapl_pmus->pmus[topology_logical_package_id(cpu)]; +} static inline u64 rapl_read_counter(struct perf_event *event) { @@ -317,12 +325,12 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, msr, ret = 0; + struct rapl_pmu *pmu; /* only look at RAPL events */ - if (event->attr.type != rapl_pmu_class.type) + if (event->attr.type != rapl_pmus->pmu.type) return -ENOENT; /* check only supported bits 
are set */ @@ -370,6 +378,7 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ + pmu = cpu_to_rapl_pmu(event->cpu); event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -502,116 +511,62 @@ const struct attribute_group *rapl_attr_groups[] = { NULL, }; -static struct pmu rapl_pmu_class = { - .attr_groups = rapl_attr_groups, - .task_ctx_nr = perf_invalid_context, /* system-wide only */ - .event_init = rapl_pmu_event_init, - .add = rapl_pmu_event_add, /* must have */ - .del = rapl_pmu_event_del, /* must have */ - .start = rapl_pmu_event_start, - .stop = rapl_pmu_event_stop, - .read = rapl_pmu_event_read, -}; - static void rapl_cpu_exit(int cpu) { - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int i, phys_id = topology_physical_package_id(cpu); - int target = -1; + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; - /* find a new cpu on same package */ - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - /* - * clear cpu from cpumask - * if was set in cpumask and still some cpu on package, - * then move to new cpu - */ - if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &rapl_cpu_mask); + /* Check if exiting cpu is used for collecting rapl events */ + if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) + return; - WARN_ON(cpumask_empty(&rapl_cpu_mask)); - /* - * migrate events and context to new cpu - */ - if (target >= 0) - perf_pmu_migrate_context(pmu->pmu, cpu, target); + pmu->cpu = -1; + /* Find a new cpu to collect rapl events */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - /* cancel overflow polling timer for CPU */ - hrtimer_cancel(&pmu->hrtimer); + /* Migrate rapl events to the new target */ + if (target < nr_cpu_ids) { + cpumask_set_cpu(target, &rapl_cpu_mask); + pmu->cpu = target; + perf_pmu_migrate_context(pmu->pmu, cpu, target); + } } static void rapl_cpu_init(int cpu) { - int i, phys_id = topology_physical_package_id(cpu); + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; + + /* + * Check if there is an online cpu in the package which collects rapl + * events already. 
+ */ + target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu)); + if (target < nr_cpu_ids) + return; - /* check if phys_is is already covered */ - for_each_cpu(i, &rapl_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - /* was not found, so add it */ cpumask_set_cpu(cpu, &rapl_cpu_mask); + pmu->cpu = cpu; } static int rapl_cpu_prepare(int cpu) { - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int phys_id = topology_physical_package_id(cpu); + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); if (pmu) return 0; - if (phys_id < 0) - return -1; - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) - return -1; - raw_spin_lock_init(&pmu->lock); + return -ENOMEM; + raw_spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); - - pmu->pmu = &rapl_pmu_class; - pmu->cpu = cpu; - + pmu->pmu = &rapl_pmus->pmu; pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - + pmu->cpu = -1; rapl_hrtimer_init(pmu); - - /* set RAPL pmu for this cpu for now */ - per_cpu(rapl_pmu, cpu) = pmu; - per_cpu(rapl_pmu_to_free, cpu) = NULL; - - return 0; -} - -static void rapl_cpu_kfree(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); - - kfree(pmu); - - per_cpu(rapl_pmu_to_free, cpu) = NULL; -} - -static int rapl_cpu_dying(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - - if (!pmu) - return 0; - - per_cpu(rapl_pmu, cpu) = NULL; - - per_cpu(rapl_pmu_to_free, cpu) = pmu; - + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; return 0; } @@ -624,24 +579,16 @@ static int rapl_cpu_notifier(struct notifier_block *self, case CPU_UP_PREPARE: rapl_cpu_prepare(cpu); break; - case CPU_STARTING: - rapl_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - rapl_cpu_dying(cpu); - break; + + case CPU_DOWN_FAILED: case CPU_ONLINE: - case CPU_DEAD: - rapl_cpu_kfree(cpu); + rapl_cpu_init(cpu); break; + case CPU_DOWN_PREPARE: rapl_cpu_exit(cpu); break; - default: - break; } - return NOTIFY_OK; } @@ -703,10 +650,14 @@ static void __init rapl_advertise(void) static int __init rapl_prepare_cpus(void) { - unsigned int cpu; + unsigned int cpu, pkg; int ret; for_each_online_cpu(cpu) { + pkg = topology_logical_package_id(cpu); + if (rapl_pmus->pmus[pkg]) + continue; + ret = rapl_cpu_prepare(cpu); if (ret) return ret; @@ -717,10 +668,33 @@ static int __init rapl_prepare_cpus(void) static void __init cleanup_rapl_pmus(void) { - int cpu; + int i; + + for (i = 0; i < rapl_pmus->maxpkg; i++) + kfree(rapl_pmus->pmus + i); + kfree(rapl_pmus); +} - for_each_online_cpu(cpu) - kfree(per_cpu(rapl_pmu, cpu)); +static int __init init_rapl_pmus(void) +{ + int maxpkg = topology_max_packages(); + size_t size; + + size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *); + rapl_pmus = kzalloc(size, GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + + rapl_pmus->maxpkg = maxpkg; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; + rapl_pmus->pmu.event_init = rapl_pmu_event_init; + rapl_pmus->pmu.add = rapl_pmu_event_add; + rapl_pmus->pmu.del = rapl_pmu_event_del; + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; + return 0; } static const struct x86_cpu_id rapl_cpu_match[] __initconst = { @@ -771,13 +745,17 @@ static int __init rapl_pmu_init(void) if (ret) return ret; + ret = init_rapl_pmus(); + if (ret) + return ret; + cpu_notifier_register_begin(); ret = rapl_prepare_cpus(); if (ret) goto out; - ret = 
perf_pmu_register(&rapl_pmu_class, "power", -1); + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) goto out; -- cgit From fc6d73d67436e7784758a831227bd019547a3f73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:40 +0000 Subject: arch/hotplug: Call into idle with a proper state Let the non boot cpus call into idle with the corresponding hotplug state, so the hotplug core can handle the further bringup. That's a first step to convert the boot side of the hotplugged cpus to do all the synchronization with the other side through the state machine. For now it'll only start the hotplug thread and kick the full bringup of the cpu. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.614102639@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 2 +- arch/x86/xen/smp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f77b3c1..293b22a7ab02 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -248,7 +248,7 @@ static void notrace start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_store_boot_cpu_info(void) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3f4ebf0261f2..3c6d17fd423a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void xen_smp_intr_free(unsigned int cpu) -- cgit From 2680d6da455b636dd006636780c0f235c6561d70 Mon Sep 17 00:00:00 2001 From: Owen Hofmann Date: Tue, 1 Mar 2016 13:36:13 -0800 Subject: kvm: x86: Update tsc multiplier on change. vmx.c writes the TSC_MULTIPLIER field in vmx_vcpu_load, but only when a vcpu has migrated physical cpus. Record the last value written and update in vmx_vcpu_load on any change, otherwise a cpu migration must occur for TSC frequency scaling to take effect. 
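The fix below amounts to a small shadow-value cache: remember the last multiplier written to the VMCS and rewrite it only when the desired ratio changes, instead of tying the write to a physical CPU migration. A condensed sketch of that pattern as a hypothetical helper; the name and the factoring are invented here, only the fields and calls come from the diff that follows.

static void vmx_update_tsc_multiplier(struct vcpu_vmx *vmx, struct kvm_vcpu *vcpu)
{
	if (!kvm_has_tsc_control)
		return;
	if (vmx->current_tsc_ratio == vcpu->arch.tsc_scaling_ratio)
		return;			/* nothing changed, skip the VMCS write */

	vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
	vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
}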
Cc: stable@vger.kernel.org Fixes: ff2c3a1803775cc72dc6f624b59554956396b0ee Signed-off-by: Owen Hofmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2951b6edbbc..0ff453749a90 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,6 +596,8 @@ struct vcpu_vmx { /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; + + u64 current_tsc_ratio; }; enum segment_cache_field { @@ -2127,14 +2129,16 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - /* Setup TSC multiplier */ - if (cpu_has_vmx_tsc_scaling()) - vmcs_write64(TSC_MULTIPLIER, - vcpu->arch.tsc_scaling_ratio); - vmx->loaded_vmcs->cpu = cpu; } + /* Setup TSC multiplier */ + if (kvm_has_tsc_control && + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { + vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); + } + vmx_vcpu_pi_load(vcpu, cpu); } -- cgit From 92f9e179a702a6adbc11e2fedc76ecd6ffc9e3f7 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Wed, 2 Mar 2016 16:05:29 -0800 Subject: PM / sleep / x86: Fix crash on graph trace through x86 suspend Pause/unpause graph tracing around do_suspend_lowlevel as it has inconsistent call/return info after it jumps to the wakeup vector. The graph trace buffer will otherwise become misaligned and may eventually crash and hang on suspend. To reproduce the issue and test the fix: Run a function_graph trace over suspend/resume and set the graph function to suspend_devices_and_enter. This consistently hangs the system without this fix. Signed-off-by: Todd Brandt Cc: All applicable Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/sleep.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d1daead5fcdd..adb3eaf8fe2a 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include #include +#include #include "../../realmode/rm/wakeup.h" #include "sleep.h" @@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. + */ + pause_graph_tracing(); do_suspend_lowlevel(); + unpause_graph_tracing(); return 0; } -- cgit From 6f6e151692625e29810f2fe036b4f410540567c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 3 Mar 2016 09:12:27 +0100 Subject: perf/x86/uncore: Fix build on UP-IOAPIC configs Commit: cf6d445f6897 ("perf/x86/uncore: Track packages, not per CPU data") reorganized the uncore code to track packages, and introduced a dependency on MAX_APIC_ID. This constant is not available on UP-IOAPIC builds: arch/x86/events/intel/uncore.c:1350:44: error: 'MAX_LOCAL_APIC' undeclared here (not in a function) Include asm/apicdef.h explicitly to pick it up. 
Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 01c67935bb28..79766b9a3580 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -1,6 +1,8 @@ #include #include #include +#include + #include #include "../perf_event.h" -- cgit From 9e4aabe2bb3454c83dac8139cf9974503ee044db Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 29 Feb 2016 16:04:43 +0100 Subject: kvm: x86: Convert ioapic->rtc_status.dest_map to a struct Currently this is a bitmap which tracks which CPUs we expect an EOI from. Move this bitmap to a struct so that we can track additional information there. Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 13 +++++++------ arch/x86/kvm/ioapic.h | 10 ++++++++-- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/lapic.c | 10 +++++----- arch/x86/kvm/lapic.h | 7 +++++-- 5 files changed, 26 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 1facfd60b04a..f2c9906c5849 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } @@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) { - if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { + if (test_and_clear_bit(vcpu->vcpu_id, + ioapic->rtc_status.dest_map.map)) { --ioapic->rtc_status.pending_eoi; rtc_status_pending_eoi_check_valid(ioapic); } @@ -346,7 +347,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) */ BUG_ON(ioapic->rtc_status.pending_eoi != 0); ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, - ioapic->rtc_status.dest_map); + &ioapic->rtc_status.dest_map); ioapic->rtc_status.pending_eoi = (ret < 0 ? 
0 : ret); } else ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 2d16dc251d81..af729890f6f5 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -40,9 +40,14 @@ struct kvm_vcpu; #define RTC_GSI -1U #endif +struct dest_map { + DECLARE_BITMAP(map, KVM_MAX_VCPUS); +}; + + struct rtc_status { int pending_eoi; - DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); + struct dest_map dest_map; }; union kvm_ioapic_redirect_entry { @@ -118,7 +123,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, unsigned long *dest_map); + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 37217363887d..54ead79e444b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -54,7 +54,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, } int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, unsigned long *dest_map) + struct kvm_lapic_irq *irq, struct dest_map *dest_map) { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf74404230ca..e085695e6b5a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -485,10 +485,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - unsigned long *dest_map); + struct dest_map *dest_map); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) + struct dest_map *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -695,7 +695,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; unsigned long bitmap = 1; @@ -894,7 +894,7 @@ out: */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - unsigned long *dest_map) + struct dest_map *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -915,7 +915,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; if (dest_map) - __set_bit(vcpu->vcpu_id, dest_map); + __set_bit(vcpu->vcpu_id, dest_map->map); if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 59610099af04..f71183e502ee 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -42,6 +42,9 @@ struct kvm_lapic { unsigned long pending_events; unsigned int sipi_vector; }; + +struct dest_map; + int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int 
kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map); + struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -- cgit From 9daa50076f585854f0040aa8403eac020d6f5d64 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 29 Feb 2016 16:04:44 +0100 Subject: kvm: x86: Track irq vectors in ioapic->rtc_status.dest_map This allows backtracking later in case the rtc irq has been moved to another vcpu/vector. Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.h | 7 +++++++ arch/x86/kvm/lapic.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index af729890f6f5..7d2692a49657 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -41,7 +41,14 @@ struct kvm_vcpu; #endif struct dest_map { + /* vcpu bitmap where IRQ has been sent */ DECLARE_BITMAP(map, KVM_MAX_VCPUS); + + /* + * Vector sent to a given vcpu, only valid when + * the vcpu's bit in map is set + */ + u8 vectors[KVM_MAX_VCPUS]; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e085695e6b5a..d9ae1ce2a6a0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -914,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; - if (dest_map) + if (dest_map) { __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = vector; + } if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) -- cgit From 4d99ba898dd0c521ca6cdfdde55c9b58aea3cb3d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 29 Feb 2016 16:04:45 +0100 Subject: kvm: x86: Check dest_map->vector to match eoi signals for rtc Using the vector stored at interrupt delivery makes the eoi matching safe agains irq migration in the ioapic. 
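Taken together, these dest_map patches make RTC EOI tracking work roughly as sketched below. This is an illustration only; the helper names are invented, while the fields and bit operations follow the diffs in this series.

/* at interrupt delivery (__apic_accept_irq) */
static void record_rtc_delivery(struct dest_map *dest_map, int vcpu_id, int vector)
{
	__set_bit(vcpu_id, dest_map->map);	/* this vcpu still owes an EOI */
	dest_map->vectors[vcpu_id] = vector;	/* remember which vector was used */
}

/* at EOI time (__kvm_ioapic_update_eoi) */
static bool rtc_eoi_matches(struct dest_map *dest_map, int vcpu_id, int vector)
{
	/*
	 * Match on the vector recorded at delivery, so reprogramming the
	 * ioapic entry (irq moved to another vcpu/vector) between delivery
	 * and EOI cannot be mistaken for the RTC EOI.
	 */
	return test_bit(vcpu_id, dest_map->map) &&
	       vector == dest_map->vectors[vcpu_id];
}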
Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index f2c9906c5849..9db47090ead0 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -237,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; int index; spin_lock(&ioapic->lock); + + /* Make sure we see any missing RTC EOI */ + if (test_bit(vcpu->vcpu_id, dest_map->map)) + __set_bit(dest_map->vectors[vcpu->vcpu_id], + ioapic_handled_vectors); + for (index = 0; index < IOAPIC_NUM_PINS; index++) { e = &ioapic->redirtbl[index]; if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || @@ -408,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work) static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { - int i; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + /* RTC special handling */ + if (test_bit(vcpu->vcpu_id, dest_map->map) && + vector == dest_map->vectors[vcpu->vcpu_id]) + rtc_irq_eoi(ioapic, vcpu); for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -417,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, if (ent->fields.vector != vector) continue; - if (i == RTC_GSI) - rtc_irq_eoi(ioapic, vcpu); /* * We are dropping lock while calling ack notifiers because ack * notifier callbacks for assigned devices call into IOAPIC -- cgit From 92f94f1e9e509caa564353c516c904278999e350 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:06 +0800 Subject: KVM: MMU: rename has_wrprotected_page to mmu_gfn_lpage_is_disallowed kvm_lpage_info->write_count is used to detect if the large page mapping for the gfn on the specified level is allowed, rename it to disallow_lpage to reflect its purpose, also we rename has_wrprotected_page() to mmu_gfn_lpage_is_disallowed() to make the code more clearer Later we will extend this mechanism for page tracking: if the gfn is tracked then large mapping for that gfn on any level is not allowed. 
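The commit's rationale continues below; before that, a minimal sketch of why disallow_lpage is kept as a count rather than a boolean: several independent users (shadowed guest page tables now, page tracking in the later patches) can each veto large mappings for the same gfn, and the mapping becomes allowed again only when all of them are gone. The helpers here are illustrative only; the real code operates per level and per memslot, as the diff below shows.

static void disallow_lpage_get(struct kvm_lpage_info *linfo)
{
	linfo->disallow_lpage++;	/* one more user vetoing large pages */
}

static void disallow_lpage_put(struct kvm_lpage_info *linfo)
{
	linfo->disallow_lpage--;	/* last user gone, large pages allowed again */
}

static bool lpage_is_disallowed(struct kvm_lpage_info *linfo)
{
	return linfo->disallow_lpage != 0;
}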
The new name is more straightforward Reviewed-by: Paolo Bonzini Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 25 +++++++++++++------------ arch/x86/kvm/x86.c | 14 ++++++++------ 3 files changed, 22 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7b5459982433..1f7fed5f35fc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -644,7 +644,7 @@ struct kvm_vcpu_arch { }; struct kvm_lpage_info { - int write_count; + int disallow_lpage; }; struct kvm_arch_memory_slot { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07f4c26a10d3..06dd4abfeaa3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -789,7 +789,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slot = __gfn_to_memslot(slots, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count += 1; + linfo->disallow_lpage += 1; } kvm->arch.indirect_shadow_pages++; } @@ -807,31 +807,32 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slot = __gfn_to_memslot(slots, gfn); for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count -= 1; - WARN_ON(linfo->write_count < 0); + linfo->disallow_lpage -= 1; + WARN_ON(linfo->disallow_lpage < 0); } kvm->arch.indirect_shadow_pages--; } -static int __has_wrprotected_page(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { struct kvm_lpage_info *linfo; if (slot) { linfo = lpage_info_slot(gfn, slot, level); - return linfo->write_count; + return !!linfo->disallow_lpage; } - return 1; + return true; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __has_wrprotected_page(gfn, level, slot); + return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } static int host_mapping_level(struct kvm *kvm, gfn_t gfn) @@ -897,7 +898,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (__has_wrprotected_page(large_gfn, level, slot)) + if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) break; return level - 1; @@ -2503,7 +2504,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. 
*/ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu, gfn, level)) + mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2768,7 +2769,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { + !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2fb92c0af803..668625e47102 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7879,6 +7879,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; int level = i + 1; @@ -7893,15 +7894,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, if (i == 0) continue; - slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * - sizeof(*slot->arch.lpage_info[i - 1])); - if (!slot->arch.lpage_info[i - 1]) + linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + if (!linfo) goto out_free; + slot->arch.lpage_info[i - 1] = linfo; + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][0].write_count = 1; + linfo[0].disallow_lpage = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; + linfo[lpages - 1].disallow_lpage = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -7913,7 +7915,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i - 1][j].write_count = 1; + linfo[j].disallow_lpage = 1; } } -- cgit From 547ffaed871af7db8713972eaf630802cac47cb1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:07 +0800 Subject: KVM: MMU: introduce kvm_mmu_gfn_{allow,disallow}_lpage Abstract the common operations from account_shadowed() and unaccount_shadowed(), then introduce kvm_mmu_gfn_disallow_lpage() and kvm_mmu_gfn_allow_lpage() These two functions will be used by page tracking in the later patch Reviewed-by: Paolo Bonzini Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 38 +++++++++++++++++++++++++------------- arch/x86/kvm/mmu.h | 3 +++ 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 06dd4abfeaa3..566639dd13b3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -776,21 +776,39 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, + gfn_t gfn, int count) +{ + struct kvm_lpage_info *linfo; + int i; + + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + linfo = lpage_info_slot(gfn, slot, i); + linfo->disallow_lpage += count; + WARN_ON(linfo->disallow_lpage < 0); + } +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, 1); +} + +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, -1); +} + static void 
account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->disallow_lpage += 1; - } + kvm_mmu_gfn_disallow_lpage(slot, gfn); kvm->arch.indirect_shadow_pages++; } @@ -798,18 +816,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->disallow_lpage -= 1; - WARN_ON(linfo->disallow_lpage < 0); - } + kvm_mmu_gfn_allow_lpage(slot, gfn); kvm->arch.indirect_shadow_pages--; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 55ffb7b0f95e..de92bed207f1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -174,4 +174,7 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); #endif -- cgit From aeecee2ea6e2b020de8bb562f4e79ab34eda3e22 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:08 +0800 Subject: KVM: MMU: introduce kvm_mmu_slot_gfn_write_protect Split rmap_write_protect() and introduce the function to abstract the write protection based on the slot This function will be used in the later patch Reviewed-by: Paolo Bonzini Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 +++++++++++----- arch/x86/kvm/mmu.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 566639dd13b3..7184218acf78 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1336,23 +1336,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn) { - struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); } return write_protected; } +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); +} + static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index de92bed207f1..58fe98a0a526 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -177,4 +177,6 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
gfn_start, gfn_t gfn_end); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); #endif -- cgit From 21ebbedaddf25a35a70fedc001ba7e5f5b9129bc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:09 +0800 Subject: KVM: page track: add the framework of guest page tracking The array, gfn_track[mode][gfn], is introduced in memory slot for every guest page, this is the tracking count for the gust page on different modes. If the page is tracked then the count is increased, the page is not tracked after the count reaches zero We use 'unsigned short' as the tracking count which should be enough as shadow page table only can use 2^14 (2^3 for level, 2^1 for cr4_pae, 2^2 for quadrant, 2^3 for access, 2^1 for nxe, 2^1 for cr0_wp, 2^1 for smep_andnot_wp, 2^1 for smap_andnot_wp, and 2^1 for smm) at most, there is enough room for other trackers Two callbacks, kvm_page_track_create_memslot() and kvm_page_track_free_memslot() are implemented in this patch, they are internally used to initialize and reclaim the memory of the array Currently, only write track mode is supported Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 10 +++++++ arch/x86/include/asm/kvm_page_track.h | 13 +++++++++ arch/x86/kvm/Makefile | 3 +- arch/x86/kvm/page_track.c | 52 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 5 ++++ 5 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/kvm_page_track.h create mode 100644 arch/x86/kvm/page_track.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1f7fed5f35fc..71e43fe04bbc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -32,6 +32,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS 255 #define KVM_SOFT_MAX_VCPUS 160 @@ -214,6 +215,14 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; +/* + * the pages used as guest page table on soft mmu are tracked by + * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used + * by indirect shadow page can not be more than 15 bits. + * + * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access, + * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. 
+ */ union kvm_mmu_page_role { unsigned word; struct { @@ -650,6 +659,7 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; }; /* diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h new file mode 100644 index 000000000000..5520040682d1 --- /dev/null +++ b/arch/x86/include/asm/kvm_page_track.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_KVM_PAGE_TRACK_H +#define _ASM_X86_KVM_PAGE_TRACK_H + +enum kvm_page_track_mode { + KVM_PAGE_TRACK_WRITE, + KVM_PAGE_TRACK_MAX, +}; + +void kvm_page_track_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages); +#endif diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a1ff508bb423..464fa477afbf 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o + hyperv.o page_track.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o + kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c new file mode 100644 index 000000000000..8c396d08c0a4 --- /dev/null +++ b/arch/x86/kvm/page_track.c @@ -0,0 +1,52 @@ +/* + * Support KVM gust page tracking + * + * This feature allows us to track page access in guest. Currently, only + * write access is tracked. + * + * Copyright(C) 2015 Intel Corporation. + * + * Author: + * Xiao Guangrong + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include +#include +#include + +#include "mmu.h" + +void kvm_page_track_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) + if (!dont || free->arch.gfn_track[i] != + dont->arch.gfn_track[i]) { + kvfree(free->arch.gfn_track[i]); + free->arch.gfn_track[i] = NULL; + } +} + +int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + slot->arch.gfn_track[i] = kvm_kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i])); + if (!slot->arch.gfn_track[i]) + goto track_free; + } + + return 0; + +track_free: + kvm_page_track_free_memslot(slot, NULL); + return -ENOMEM; +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 668625e47102..7b4cfea09deb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7871,6 +7871,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, free->arch.lpage_info[i - 1] = NULL; } } + + kvm_page_track_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -7919,6 +7921,9 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, } } + if (kvm_page_track_create_memslot(slot, npages)) + goto out_free; + return 0; out_free: -- cgit From f29d4d7810d7fd61442371cd68957e1d37ed79bb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:10 +0800 Subject: KVM: page track: introduce kvm_slot_page_track_{add,remove}_page These two functions are the user APIs: - kvm_slot_page_track_add_page(): add the page to the tracking pool after that later specified access on that page will be tracked - kvm_slot_page_track_remove_page(): remove the page from the tracking pool, the specified access on the page is not tracked after the last user is gone Both of these are called under the protection both of mmu-lock and kvm->srcu or kvm->slots_lock Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_page_track.h | 7 +++ arch/x86/kvm/page_track.c | 85 +++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 5520040682d1..e363e3040ba4 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -10,4 +10,11 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); + +void kvm_slot_page_track_add_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode); +void kvm_slot_page_track_remove_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode); #endif diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 8c396d08c0a4..cd76bc318968 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -50,3 +50,88 @@ track_free: kvm_page_track_free_memslot(slot, NULL); return -ENOMEM; } + +static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) +{ + if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) + return false; + + return true; +} + +static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode, short count) +{ + int index, val; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + + val = 
slot->arch.gfn_track[mode][index]; + + if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) + return; + + slot->arch.gfn_track[mode][index] += count; +} + +/* + * add guest page to the tracking pool so that corresponding access on that + * page will be intercepted. + * + * It should be called under the protection both of mmu-lock and kvm->srcu + * or kvm->slots_lock. + * + * @kvm: the guest instance we are interested in. + * @slot: the @gfn belongs to. + * @gfn: the guest page. + * @mode: tracking mode, currently only write track is supported. + */ +void kvm_slot_page_track_add_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + + if (WARN_ON(!page_track_mode_is_valid(mode))) + return; + + update_gfn_track(slot, gfn, mode, 1); + + /* + * new track stops large page mapping for the + * tracked page. + */ + kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (mode == KVM_PAGE_TRACK_WRITE) + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) + kvm_flush_remote_tlbs(kvm); +} + +/* + * remove the guest page from the tracking pool which stops the interception + * of corresponding access on that page. It is the opposed operation of + * kvm_slot_page_track_add_page(). + * + * It should be called under the protection both of mmu-lock and kvm->srcu + * or kvm->slots_lock. + * + * @kvm: the guest instance we are interested in. + * @slot: the @gfn belongs to. + * @gfn: the guest page. + * @mode: tracking mode, currently only write track is supported. + */ +void kvm_slot_page_track_remove_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + if (WARN_ON(!page_track_mode_is_valid(mode))) + return; + + update_gfn_track(slot, gfn, mode, -1); + + /* + * allow large page mapping for the tracked page + * after the tracker is gone. + */ + kvm_mmu_gfn_allow_lpage(slot, gfn); +} -- cgit From 3d0c27ad6ee465f174b09ee99fcaf189c57d567a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:11 +0800 Subject: KVM: MMU: let page fault handler be aware tracked page The page fault caused by write access on the write tracked page can not be fixed, it always need to be emulated. 
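[ For context, a minimal usage sketch of the kvm_slot_page_track_{add,remove}_page()
  API introduced by the previous patch. This is an illustration only, not part of the
  series: the consumer and the helper names my_start_tracking()/my_stop_tracking()
  are hypothetical, and the caller is assumed to already hold kvm->srcu or
  kvm->slots_lock, with mmu_lock taken around the calls as the API requires:

        #include <linux/kvm_host.h>
        #include <asm/kvm_page_track.h>

        static void my_start_tracking(struct kvm *kvm,
                                      struct kvm_memory_slot *slot, gfn_t gfn)
        {
                /* writes to gfn will now be intercepted and emulated */
                spin_lock(&kvm->mmu_lock);
                kvm_slot_page_track_add_page(kvm, slot, gfn,
                                             KVM_PAGE_TRACK_WRITE);
                spin_unlock(&kvm->mmu_lock);
        }

        static void my_stop_tracking(struct kvm *kvm,
                                     struct kvm_memory_slot *slot, gfn_t gfn)
        {
                /* last remover re-allows large page mappings for gfn */
                spin_lock(&kvm->mmu_lock);
                kvm_slot_page_track_remove_page(kvm, slot, gfn,
                                                KVM_PAGE_TRACK_WRITE);
                spin_unlock(&kvm->mmu_lock);
        }
]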
page_fault_handle_page_track() is the fast path we introduce here to skip holding mmu-lock and shadow page table walking However, if the page table is not present, it is worth making the page table entry present and readonly to make the read access happy mmu_need_write_protect() need to be cooked to avoid page becoming writable when making page table present or sync/prefetch shadow page table entries Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_page_track.h | 2 ++ arch/x86/kvm/mmu.c | 44 +++++++++++++++++++++++++++++------ arch/x86/kvm/page_track.c | 15 ++++++++++++ arch/x86/kvm/paging_tmpl.h | 3 +++ 4 files changed, 57 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index e363e3040ba4..5f16e2864e73 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -17,4 +17,6 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, void kvm_slot_page_track_remove_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode); +bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7184218acf78..dd8e3ca2d79b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -2448,25 +2449,29 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) } } -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *s; bool need_unsync = false; + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!can_unsync) - return 1; + return true; if (s->role.level != PT_PAGE_TABLE_LEVEL) - return 1; + return true; if (!s->unsync) need_unsync = true; } if (need_unsync) kvm_unsync_pages(vcpu, gfn); - return 0; + + return false; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) @@ -3381,21 +3386,43 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); +static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, + u32 error_code, gfn_t gfn) +{ + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; + + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + /* + * guest is writing the page which is write tracked which can + * not be fixed by page fault handler. 
+ */ + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; + + return false; +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { - gfn_t gfn; + gfn_t gfn = gva >> PAGE_SHIFT; int r; pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; + r = mmu_topup_memory_caches(vcpu); if (r) return r; MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, error_code, gfn, prefault); @@ -3472,6 +3499,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; + r = mmu_topup_memory_caches(vcpu); if (r) return r; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index cd76bc318968..f127f6d04fa1 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -135,3 +135,18 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } + +/* + * check if the corresponding access on the specified guest page is tracked. + */ +bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + + if (WARN_ON(!page_track_mode_is_valid(mode))) + return false; + + return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); +} diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 05827ff7bd2e..52ae2d94cc9e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -728,6 +728,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) + return 1; + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, -- cgit From e5691a81e830c12d396b3f219ab999be87a1208f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:12 +0800 Subject: KVM: MMU: clear write-flooding on the fast path of tracked page If the page fault is caused by write access on write tracked page, the real shadow page walking is skipped, we lost the chance to clear write flooding for the page structure current vcpu is using Fix it by locklessly waking shadow page table to clear write flooding on the shadow page structure out of mmu-lock. So that we change the count to atomic_t Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 22 ++++++++++++++++++++-- arch/x86/kvm/paging_tmpl.h | 4 +++- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 71e43fe04bbc..e2fc5c0ec86a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -285,7 +285,7 @@ struct kvm_mmu_page { #endif /* Number of writes since the last time traversal visited this page. 
*/ - int write_flooding_count; + atomic_t write_flooding_count; }; struct kvm_pio_request { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd8e3ca2d79b..58c067da6efc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2063,7 +2063,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { - sp->write_flooding_count = 0; + atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) @@ -3406,6 +3406,23 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, return false; } +static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) +{ + struct kvm_shadow_walk_iterator iterator; + u64 spte; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + clear_sp_write_flooding_count(iterator.sptep); + if (!is_shadow_present_pte(spte)) + break; + } + walk_shadow_page_lockless_end(vcpu); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { @@ -4221,7 +4238,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; - return ++sp->write_flooding_count >= 3; + atomic_inc(&sp->write_flooding_count); + return atomic_read(&sp->write_flooding_count) >= 3; } /* diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 52ae2d94cc9e..4174cf290fa3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -728,8 +728,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } - if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) + if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { + shadow_page_table_clear_flood(vcpu, addr); return 1; + } vcpu->arch.write_fault_to_shadow_pgtable = false; -- cgit From 0eb05bf290cfe8610d9680b49abef37febd1c38a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:13 +0800 Subject: KVM: page track: add notifier support Notifier list is introduced so that any node wants to receive the track event can register to the list Two APIs are introduced here: - kvm_page_track_register_notifier(): register the notifier to receive track event - kvm_page_track_unregister_notifier(): stop receiving track event by unregister the notifier The callback, node->track_write() is called when a write access on the write tracked page happens Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/kvm_page_track.h | 39 +++++++++++++++++++ arch/x86/kvm/page_track.c | 70 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 4 ++ 4 files changed, 114 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e2fc5c0ec86a..eb68e6aca0cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -704,6 +704,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct kvm_page_track_notifier_head track_notifier_head; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 5f16e2864e73..c2b8d24a235c 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -6,6 +6,36 @@ 
enum kvm_page_track_mode { KVM_PAGE_TRACK_MAX, }; +/* + * The notifier represented by @kvm_page_track_notifier_node is linked into + * the head which will be notified when guest is triggering the track event. + * + * Write access on the head is protected by kvm->mmu_lock, read access + * is protected by track_srcu. + */ +struct kvm_page_track_notifier_head { + struct srcu_struct track_srcu; + struct hlist_head track_notifier_list; +}; + +struct kvm_page_track_notifier_node { + struct hlist_node node; + + /* + * It is called when guest is writing the write-tracked page + * and write emulation is finished at that time. + * + * @vcpu: the vcpu where the write access happened. + * @gpa: the physical address written by guest. + * @new: the data was written to the address. + * @bytes: the written length. + */ + void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); +}; + +void kvm_page_track_init(struct kvm *kvm); + void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, @@ -19,4 +49,13 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, enum kvm_page_track_mode mode); bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode); + +void +kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void +kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes); #endif diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index f127f6d04fa1..11f76436f74f 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -150,3 +150,73 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } + +void kvm_page_track_init(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + init_srcu_struct(&head->track_srcu); + INIT_HLIST_HEAD(&head->track_notifier_list); +} + +/* + * register the notifier so that event interception for the tracked guest + * pages can be received. + */ +void +kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + + spin_lock(&kvm->mmu_lock); + hlist_add_head_rcu(&n->node, &head->track_notifier_list); + spin_unlock(&kvm->mmu_lock); +} + +/* + * stop receiving the event interception. It is the opposed operation of + * kvm_page_track_register_notifier(). + */ +void +kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + + spin_lock(&kvm->mmu_lock); + hlist_del_rcu(&n->node); + spin_unlock(&kvm->mmu_lock); + synchronize_srcu(&head->track_srcu); +} + +/* + * Notify the node that write access is intercepted and write emulation is + * finished at this time. + * + * The node should figure out if the written page is the one that node is + * interested in by itself. 
+ */ +void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &vcpu->kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_write) + n->track_write(vcpu, gpa, new, bytes); + srcu_read_unlock(&head->track_srcu, idx); +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7b4cfea09deb..b81c14ef1e1d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4346,6 +4346,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, if (ret < 0) return 0; kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_page_track_write(vcpu, gpa, val, bytes); return 1; } @@ -4604,6 +4605,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); + kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -7724,6 +7726,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_page_track_init(kvm); + return 0; } -- cgit From 56ca57f9fe553da122472a15e49c2d808123ff0a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:14 +0800 Subject: KVM: MMU: use page track for non-leaf shadow pages non-leaf shadow pages are always write protected, it can be the user of page track Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 58c067da6efc..fe03d2a1d4d3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -806,11 +806,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) struct kvm_memory_slot *slot; gfn_t gfn; + kvm->arch.indirect_shadow_pages++; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); + + /* the non-leaf shadow pages are keeping readonly. */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_add_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + kvm_mmu_gfn_disallow_lpage(slot, gfn); - kvm->arch.indirect_shadow_pages++; } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -819,11 +825,15 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) struct kvm_memory_slot *slot; gfn_t gfn; + kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_remove_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + kvm_mmu_gfn_allow_lpage(slot, gfn); - kvm->arch.indirect_shadow_pages--; } static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, @@ -2132,12 +2142,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu, gfn)) + /* + * we should do write protection before syncing pages + * otherwise the content of the synced shadow page may + * be inconsistent with guest page table. 
+ */ + account_shadowed(vcpu->kvm, sp); + if (level == PT_PAGE_TABLE_LEVEL && + rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); - - account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); -- cgit From 5c520e90af3ad546bf328d2c9306c72bf3da6afe Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:15 +0800 Subject: KVM: MMU: simplify mmu_need_write_protect Now, all non-leaf shadow page are page tracked, if gfn is not tracked there is no non-leaf shadow page of gfn is existed, we can directly make the shadow page of gfn to unsync Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fe03d2a1d4d3..d6b264b492c9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2444,7 +2444,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; @@ -2453,39 +2453,24 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mmu_page *s; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (s->unsync) - continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - __kvm_unsync_page(vcpu, s); - } -} - static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *s; - bool need_unsync = false; + struct kvm_mmu_page *sp; if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) return true; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) return true; - if (s->role.level != PT_PAGE_TABLE_LEVEL) - return true; + if (sp->unsync) + continue; - if (!s->unsync) - need_unsync = true; + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + kvm_unsync_page(vcpu, sp); } - if (need_unsync) - kvm_unsync_pages(vcpu, gfn); return false; } -- cgit From 13d268ca2c4c29d6da2ba79419f9a655e602afed Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 17:51:16 +0800 Subject: KVM: MMU: apply page track notifier Register the notifier to receive write track event so that we can update our shadow page table It makes kvm_mmu_pte_write() be the callback of the notifier, no function is changed Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu.c | 19 +++++++++++++++++-- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eb68e6aca0cf..1c3e390993a2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -704,6 +704,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; struct list_head assigned_dev_head; @@ -1001,6 +1002,8 @@ void 
kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); @@ -1140,8 +1143,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d6b264b492c9..42ca0acc1c4e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4302,8 +4302,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) +static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -4517,6 +4517,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +void kvm_mmu_init_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + node->track_write = kvm_mmu_pte_write; + kvm_page_track_register_notifier(kvm, node); +} + +void kvm_mmu_uninit_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + kvm_page_track_unregister_notifier(kvm, node); +} + /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b81c14ef1e1d..f4654e4150b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4345,7 +4345,6 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); kvm_page_track_write(vcpu, gpa, val, bytes); return 1; } @@ -4604,7 +4603,6 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CMPXCHG_FAILED; kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - kvm_mmu_pte_write(vcpu, gpa, new, bytes); kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -7727,6 +7725,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); kvm_page_track_init(kvm); + kvm_mmu_init_vm(kvm); return 0; } @@ -7854,6 +7853,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvm_mmu_uninit_vm(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, -- cgit From 19072f23d1d785c093b7f81cb1fb161e7a13ecc0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 2 Mar 2016 18:39:36 -0600 Subject: x86/asm/decoder: Use explicitly signed chars When running objtool on a ppc64le host to analyze x86 binaries, it reports a lot of false warnings like: ipc/compat_mq.o: warning: objtool: compat_SyS_mq_open()+0x91: can't find jump dest instruction at .text+0x3a5 The warnings are caused by the x86 instruction decoder setting the wrong value for the jump instruction's immediate field 
because it assumes that "char == signed char", which isn't true for all architectures. When converting char to int, gcc sign-extends on x86 but doesn't sign-extend on ppc64le. According to the gcc man page, that's a feature, not a bug: > Each kind of machine has a default for what "char" should be. It is > either like "unsigned char" by default or like "signed char" by > default. > > Ideally, a portable program should always use "signed char" or > "unsigned char" when it depends on the signedness of an object. Conform to the "standards" by changing the "char" casts to "signed char". This results in no actual changes to the object code on x86. Note: the x86 decoder now lives in three different locations in the kernel tree, which are all kept in sync via makefile checks and warnings: in-kernel, perf, and objtool. This fixes all three locations. Eventually we should probably try to at least converge the two separate "tools" locations into a single shared location. Signed-off-by: Josh Poimboeuf Cc: Adrian Hunter Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9dd4161719b20e6def9564646d68bfbe498c549f.1456962210.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 8f72b334aea0..1a416935bac9 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -374,7 +374,7 @@ void insn_get_displacement(struct insn *insn) if (mod == 3) goto out; if (mod == 1) { - insn->displacement.value = get_next(char, insn); + insn->displacement.value = get_next(signed char, insn); insn->displacement.nbytes = 1; } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { @@ -532,7 +532,7 @@ void insn_get_immediate(struct insn *insn) switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: - insn->immediate.value = get_next(char, insn); + insn->immediate.value = get_next(signed char, insn); insn->immediate.nbytes = 1; break; case INAT_IMM_WORD: @@ -566,7 +566,7 @@ void insn_get_immediate(struct insn *insn) goto err_out; } if (inat_has_second_immediate(insn->attr)) { - insn->immediate2.value = get_next(char, insn); + insn->immediate2.value = get_next(signed char, insn); insn->immediate2.nbytes = 1; } done: -- cgit From e21555436f196c241503c7c6240272e57783235c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 1 Mar 2016 11:41:33 -0800 Subject: x86/mm/pkeys: Fix access_error() denial of writes to write-only VMA Andrey Wagin reported that a simple test case was broken by: 2b5f7d013fc ("mm/core, x86/mm/pkeys: Add execute-only protection keys support") This test case creates an unreadable VMA and my patch assumed that all writes must be to readable VMAs. The simplest fix for this is to remove the pkey-related bits in access_error(). For execute-only support, I believe the existing version is sufficient because the permissions we are trying to enforce are entirely expressed in vma->vm_flags. We just depend on pkeys to get *an* exception, it does not matter that PF_PK was set, or even what state PKRU is in. I will re-add the necessary bits with the full pkeys implementation that includes the new syscalls. The three cases that matter are: 1. If a write to an execute-only VMA occurs, we will see PF_WRITE set, but !VM_WRITE on the VMA, and return 1. All execute-only VMAs have VM_WRITE clear by definition. 2. 
If a read occurs on a present PTE, we will fall in to the "read, present" case and return 1. 3. If a read occurs to a non-present PTE, we will miss the "read, not present" case, because the execute-only VMA will have VM_EXEC set, and we will properly return 0 allowing the PTE to be populated. Test program: int main() { int *p; p = mmap(NULL, 4096, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); p[0] = 1; return 0; } Reported-by: Andrey Wagin , Signed-off-by: Dave Hansen Acked-by: Kirill A. Shutemov Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: linux-next@vger.kernel.org Fixes: 62b5f7d013fc ("mm/core, x86/mm/pkeys: Add execute-only protection keys support") Link: http://lkml.kernel.org/r/20160301194133.65D0110C@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5877b92ab6f1..6138db471b17 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1100,24 +1100,6 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; - /* - * Access or read was blocked by protection keys. We do - * this check before any others because we do not want - * to, for instance, confuse a protection-key-denied - * write with one for which we should do a COW. - */ - if (error_code & PF_PK) - return 1; - - if (!(error_code & PF_INSTR)) { - /* - * Assume all accesses require either read or execute - * permissions. This is not an instruction access, so - * it requires read permissions. - */ - if (!(vma->vm_flags & VM_READ)) - return 1; - } /* * Make sure to check the VMA so that we do not perform * faults just to hit a PF_PK as soon as we fill in a -- cgit From f9677e0f83080bb4186865868c359e72e1fac1ea Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 29 Feb 2016 06:33:47 -0800 Subject: x86/tsc: Always Running Timer (ART) correlated clocksource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On modern Intel systems TSC is derived from the new Always Running Timer (ART). ART can be captured simultaneous to the capture of audio and network device clocks, allowing a correlation between timebases to be constructed. Upon capture, the driver converts the captured ART value to the appropriate system clock using the correlated clocksource mechanism. On systems that support ART a new CPUID leaf (0x15) returns parameters “m” and “n” such that: TSC_value = (ART_value * m) / n + k [n >= 1] [k is an offset that can adjusted by a privileged agent. The IA32_TSC_ADJUST MSR is an example of an interface to adjust k. See 17.14.4 of the Intel SDM for more details] Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. 
Hall [jstultz: Tweaked to fix build issue, also reworked math for 64bit division on 32bit systems, as well as !CONFIG_CPU_FREQ build fixes] Signed-off-by: John Stultz --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/tsc.h | 2 ++ arch/x86/kernel/tsc.c | 59 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c9464297..ff557b49ab77 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -85,7 +85,7 @@ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ART (3*32+10) /* Platform has always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 6d7c5479bcea..174c4212780a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void) return rdtsc(); } +extern struct system_counterval_t convert_art_to_tsc(cycle_t art); + extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..80d761e420c5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +struct clocksource *art_related_clocksource; + /* * Use a ring-buffer like data structure, where a writer advances the head by * writing a new data entry and a reader advances the tail when it observes a @@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + /* Don't enable ART in a VM, non-stop TSC required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + return; + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + /* clocksource code */ static struct clocksource clocksource_tsc; @@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void) return 0; } +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(cycle_t art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct system_counterval_t) {.cs = art_related_clocksource, + 
.cycles = res}; +} +EXPORT_SYMBOL(convert_art_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); @@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz % 1000); out: + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); } @@ -1235,6 +1292,8 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); + + detect_art(); } #ifdef CONFIG_SMP -- cgit From 7dd0fdff145c5be7146d0ac06732ae3613412ac1 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:38 +0100 Subject: KVM: i8254: change PIT discard tick policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discard policy uses ack_notifiers to prevent injection of PIT interrupts before EOI from the last one. This patch changes the policy to always try to deliver the interrupt, which makes a difference when its vector is in ISR. Old implementation would drop the interrupt, but proposed one injects to IRR, like real hardware would. The old policy breaks legacy NMI watchdogs, where PIT is used through virtual wire (LVT0): PIT never sends an interrupt before receiving EOI, thus a guest deadlock with disabled interrupts will stop NMIs. Note that NMI doesn't do EOI, so PIT also had to send a normal interrupt through IOAPIC. (KVM's PIT is deeply rotten and luckily not used much in modern systems.) Even though there is a chance of regressions, I think we can fix the LVT0 NMI bug without introducing a new tick policy. Cc: Reported-by: Yuki Shibuya Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index b0ea42b78ccd..ab5318727579 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -245,7 +245,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) * PIC is being reset. Handle it gracefully here */ atomic_inc(&ps->pending); - else if (value > 0) + else if (value > 0 && ps->reinject) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. Reinject */ @@ -288,7 +288,9 @@ static void pit_do_work(struct kthread_work *work) * last one has been acked. */ spin_lock(&ps->inject_lock); - if (ps->irq_ack) { + if (!ps->reinject) + inject = 1; + else if (ps->irq_ack) { ps->irq_ack = 0; inject = 1; } @@ -317,10 +319,10 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); struct kvm_pit *pt = ps->kvm->arch.vpit; - if (ps->reinject || !atomic_read(&ps->pending)) { + if (ps->reinject) atomic_inc(&ps->pending); - queue_kthread_work(&pt->worker, &pt->expired); - } + + queue_kthread_work(&pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); -- cgit From f6e0a0c113f78a0e13c69762b2ab342d6c69220d Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:39 +0100 Subject: KVM: i8254: simplify atomics in kvm_pit_ack_irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have a helper that does the same thing. 
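[ For reference: atomic_dec_if_positive() decrements the counter only when it is
  currently positive and returns the new value (negative when nothing was
  decremented), so "atomic_dec_if_positive(&ps->pending) > 0" reads as "there were
  at least two pending ticks". A rough sketch of those semantics follows; the real
  helper in <linux/atomic.h> is the authoritative, lock-free cmpxchg loop, this is
  only an illustration with an invented name:

        static int dec_if_positive_sketch(atomic_t *v)
        {
                int old = atomic_read(v);

                while (old > 0) {
                        /* atomic_cmpxchg() returns the value it found */
                        int seen = atomic_cmpxchg(v, old, old - 1);

                        if (seen == old)
                                return old - 1; /* we did the decrement */
                        old = seen;             /* lost a race, retry */
                }
                return -1;                      /* counter was not positive */
        }
]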
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ab5318727579..7d694ac7f4a4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -236,19 +236,9 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - int value; spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pending); - if (value < 0) - /* spurious acks can be generated if, for example, the - * PIC is being reset. Handle it gracefully here - */ - atomic_inc(&ps->pending); - else if (value > 0 && ps->reinject) - /* in this case, we had multiple outstanding pit interrupts - * that we needed to inject. Reinject - */ + if (atomic_dec_if_positive(&ps->pending) > 0 && ps->reinject) queue_kthread_work(&ps->pit->worker, &ps->pit->expired); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); -- cgit From fd700a00dc2e821be92b0b56fd5d8ebf8c63f9ba Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:40 +0100 Subject: KVM: i8254: add kvm_pit_reset_reinject MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pit_state.pending and pit_state.irq_ack are always reset at the same time. Create a function for them. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7d694ac7f4a4..bdbb3f076e72 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -321,6 +321,12 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } +static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) +{ + atomic_set(&pit->pit_state.pending, 0); + pit->pit_state.irq_ack = 1; +} + static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; @@ -343,8 +349,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) ps->timer.function = pit_timer_fn; ps->kvm = ps->pit->kvm; - atomic_set(&ps->pending, 0); - ps->irq_ack = 1; + kvm_pit_reset_reinject(ps->pit); /* * Do not allow the guest to program periodic timers with small @@ -644,18 +649,15 @@ void kvm_pit_reset(struct kvm_pit *pit) } mutex_unlock(&pit->pit_state.lock); - atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; + kvm_pit_reset_reinject(pit); } static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); - if (!mask) { - atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; - } + if (!mask) + kvm_pit_reset_reinject(pit); } static const struct kvm_io_device_ops pit_dev_ops = { -- cgit From ddf54503e2bbed01958cf5fb16ad6378971d2468 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:41 +0100 Subject: KVM: i8254: use atomic_t instead of pit.inject_lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lock was an overkill, the same can be done with atomics. A mb() was added in kvm_pit_ack_irq, to pair with implicit barrier between pit_timer_fn and pit_do_work. 
The mb() prevents a race that could happen if pending == 0 and irq_ack == 0: kvm_pit_ack_irq: | pit_timer_fn: p = atomic_read(&ps->pending); | | atomic_inc(&ps->pending); | queue_work(pit_do_work); | pit_do_work: | atomic_xchg(&ps->irq_ack, 0); | return; atomic_set(&ps->irq_ack, 1); | if (p == 0) return; | where the interrupt would not be delivered in this tick of pit_timer_fn. PIT would have eventually delivered the interrupt, but we sacrifice perofmance to make sure that interrupts are not needlessly delayed. sfence isn't enough: atomic_dec_if_positive does atomic_read first and x86 can reorder loads before stores. lfence isn't enough: store can pass lfence, turning it into a nop. A compiler barrier would be more than enough as CPU needs to stall for unbelievably long to use fences. This patch doesn't do anything in kvm_pit_reset_reinject, because any order of resets can race, but the result differs by at most one interrupt, which is ok, because it's the same result as if the reset happened at a slightly different time. (Original code didn't protect the reset path with a proper lock, so users have to be robust.) Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 56 +++++++++++++++++++++------------------------------- arch/x86/kvm/i8254.h | 3 +-- 2 files changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index bdbb3f076e72..0f5655c50e0c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -237,11 +237,13 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + atomic_set(&ps->irq_ack, 1); + /* irq_ack should be set before pending is read. Order accesses with + * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. + */ + smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0 && ps->reinject) queue_kthread_work(&ps->pit->worker, &ps->pit->expired); - ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -272,36 +274,25 @@ static void pit_do_work(struct kthread_work *work) struct kvm_vcpu *vcpu; int i; struct kvm_kpit_state *ps = &pit->pit_state; - int inject = 0; - /* Try to inject pending interrupts when - * last one has been acked. + if (ps->reinject && !atomic_xchg(&ps->irq_ack, 0)) + return; + + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation does not follow + * the MP specification. We propagate a PIT interrupt to all + * VCPUs and only when LVT0 is in NMI mode. The interrupt can + * also be simultaneously delivered through PIC and IOAPIC. */ - spin_lock(&ps->inject_lock); - if (!ps->reinject) - inject = 1; - else if (ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - spin_unlock(&ps->inject_lock); - if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. 
Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); - } + if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); } static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) @@ -324,7 +315,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) { atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; + atomic_set(&pit->pit_state.irq_ack, 1); } static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) @@ -691,7 +682,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); pid = get_pid(task_tgid(current)); pid_nr = pid_vnr(pid); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index c84990b42b5b..f8cf4b84f435 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -33,8 +33,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; - unsigned long irq_ack; + atomic_t irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; -- cgit From b69d920f68b119bdc0483f0c33d34fd0c57724f5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:42 +0100 Subject: KVM: i8254: tone down WARN_ON pit.state_lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the guest could hit this, it would hang the host kernel, bacause of sheer number of those reports. Internal callers have to be sensible anyway, so we now only check for it in an API function. 
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0f5655c50e0c..e5a3e8015e30 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -76,8 +76,6 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) struct kvm_kpit_channel_state *c = &kvm->arch.vpit->pit_state.channels[channel]; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - switch (c->mode) { default: case 0: @@ -99,8 +97,6 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) static int pit_get_gate(struct kvm *kvm, int channel) { - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - return kvm->arch.vpit->pit_state.channels[channel].gate; } @@ -144,8 +140,6 @@ static int pit_get_count(struct kvm *kvm, int channel) s64 d, t; int counter; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); @@ -174,8 +168,6 @@ static int pit_get_out(struct kvm *kvm, int channel) s64 d, t; int out; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); @@ -207,8 +199,6 @@ static void pit_latch_count(struct kvm *kvm, int channel) struct kvm_kpit_channel_state *c = &kvm->arch.vpit->pit_state.channels[channel]; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - if (!c->count_latched) { c->latched_count = pit_get_count(kvm, channel); c->count_latched = c->rw_mode; @@ -220,8 +210,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) struct kvm_kpit_channel_state *c = &kvm->arch.vpit->pit_state.channels[channel]; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - if (!c->status_latched) { /* TODO: Return NULL COUNT (bit 6). */ c->status = ((pit_get_out(kvm, channel) << 7) | @@ -367,8 +355,6 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - WARN_ON(!mutex_is_locked(&ps->lock)); - pr_debug("load_count val is %d, channel is %d\n", val, channel); /* @@ -406,6 +392,9 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) { u8 saved_mode; + + WARN_ON_ONCE(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + if (hpet_legacy_start) { /* save existing mode for later reenablement */ WARN_ON(channel != 0); -- cgit From 09edea72b7f9fd8a8d26c1f7504d989b9773ee5e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:43 +0100 Subject: KVM: i8254: pass struct kvm_pit instead of kvm in PIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch passes struct kvm_pit into internal PIT functions. Those functions used to get PIT through kvm->arch.vpit, even though most of them never used *kvm for other purposes. Another benefit is that we don't need to set kvm->arch.vpit during initialization. 
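[ The shape of the conversion, reduced to one helper; the _old/_new suffixes exist
  only for this illustration, the patch keeps the original names and changes their
  first parameter:

        /* before: every helper re-derived the device from the VM */
        static int pit_get_gate_old(struct kvm *kvm, int channel)
        {
                return kvm->arch.vpit->pit_state.channels[channel].gate;
        }

        /* after: helpers operate on the device they are passed */
        static int pit_get_gate_new(struct kvm_pit *pit, int channel)
        {
                return pit->pit_state.channels[channel].gate;
        }
]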
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 112 ++++++++++++++++++++++++--------------------------- arch/x86/kvm/i8254.h | 4 +- arch/x86/kvm/x86.c | 26 +++++++----- 3 files changed, 70 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e5a3e8015e30..2afe09b054e7 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -71,10 +71,9 @@ static u64 muldiv64(u64 a, u32 b, u32 c) return res.ll; } -static void pit_set_gate(struct kvm *kvm, int channel, u32 val) +static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; switch (c->mode) { default: @@ -95,16 +94,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) c->gate = val; } -static int pit_get_gate(struct kvm *kvm, int channel) +static int pit_get_gate(struct kvm_pit *pit, int channel) { - return kvm->arch.vpit->pit_state.channels[channel].gate; + return pit->pit_state.channels[channel].gate; } -static s64 __kpit_elapsed(struct kvm *kvm) +static s64 __kpit_elapsed(struct kvm_pit *pit) { s64 elapsed; ktime_t remaining; - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + struct kvm_kpit_state *ps = &pit->pit_state; if (!ps->period) return 0; @@ -124,23 +123,22 @@ static s64 __kpit_elapsed(struct kvm *kvm) return elapsed; } -static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, +static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c, int channel) { if (channel == 0) - return __kpit_elapsed(kvm); + return __kpit_elapsed(pit); return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); } -static int pit_get_count(struct kvm *kvm, int channel) +static int pit_get_count(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int counter; - t = kpit_elapsed(kvm, c, channel); + t = kpit_elapsed(pit, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -161,14 +159,13 @@ static int pit_get_count(struct kvm *kvm, int channel) return counter; } -static int pit_get_out(struct kvm *kvm, int channel) +static int pit_get_out(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int out; - t = kpit_elapsed(kvm, c, channel); + t = kpit_elapsed(pit, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -194,25 +191,23 @@ static int pit_get_out(struct kvm *kvm, int channel) return out; } -static void pit_latch_count(struct kvm *kvm, int channel) +static void pit_latch_count(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->count_latched) { - c->latched_count = pit_get_count(kvm, channel); + c->latched_count = pit_get_count(pit, channel); c->count_latched = c->rw_mode; } } -static void pit_latch_status(struct kvm *kvm, int channel) +static void pit_latch_status(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = 
&pit->pit_state.channels[channel]; if (!c->status_latched) { /* TODO: Return NULL COUNT (bit 6). */ - c->status = ((pit_get_out(kvm, channel) << 7) | + c->status = ((pit_get_out(pit, channel) << 7) | (c->rw_mode << 4) | (c->mode << 1) | c->bcd); @@ -306,9 +301,10 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) atomic_set(&pit->pit_state.irq_ack, 1); } -static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) +static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + struct kvm_kpit_state *ps = &pit->pit_state; + struct kvm *kvm = pit->kvm; s64 interval; if (!ioapic_in_kernel(kvm) || @@ -326,9 +322,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) ps->is_periodic = is_period; ps->timer.function = pit_timer_fn; - ps->kvm = ps->pit->kvm; + ps->kvm = pit->kvm; - kvm_pit_reset_reinject(ps->pit); + kvm_pit_reset_reinject(pit); /* * Do not allow the guest to program periodic timers with small @@ -351,9 +347,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) HRTIMER_MODE_ABS); } -static void pit_load_count(struct kvm *kvm, int channel, u32 val) +static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + struct kvm_kpit_state *ps = &pit->pit_state; pr_debug("load_count val is %d, channel is %d\n", val, channel); @@ -378,32 +374,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(kvm, val, 0); + create_pit_timer(pit, val, 0); break; case 2: case 3: - create_pit_timer(kvm, val, 1); + create_pit_timer(pit, val, 1); break; default: - destroy_pit_timer(kvm->arch.vpit); + destroy_pit_timer(pit); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) +void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start) { u8 saved_mode; - WARN_ON_ONCE(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock)); if (hpet_legacy_start) { /* save existing mode for later reenablement */ WARN_ON(channel != 0); - saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; - kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ - pit_load_count(kvm, channel, val); - kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + saved_mode = pit->pit_state.channels[0].mode; + pit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(pit, channel, val); + pit->pit_state.channels[0].mode = saved_mode; } else { - pit_load_count(kvm, channel, val); + pit_load_count(pit, channel, val); } } @@ -429,7 +426,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; @@ -453,9 +449,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, s = &pit_state->channels[channel]; if (val & (2 << channel)) { if (!(val & 0x20)) - pit_latch_count(kvm, channel); + pit_latch_count(pit, channel); if (!(val & 0x10)) - pit_latch_status(kvm, channel); + pit_latch_status(pit, channel); } } } else { @@ -463,7 +459,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, s = &pit_state->channels[channel]; access = (val >> 4) & KVM_PIT_CHANNEL_MASK; if (access == 0) { - pit_latch_count(kvm, 
channel); + pit_latch_count(pit, channel); } else { s->rw_mode = access; s->read_state = access; @@ -480,17 +476,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, switch (s->write_state) { default: case RW_STATE_LSB: - pit_load_count(kvm, addr, val); + pit_load_count(pit, addr, val); break; case RW_STATE_MSB: - pit_load_count(kvm, addr, val << 8); + pit_load_count(pit, addr, val << 8); break; case RW_STATE_WORD0: s->write_latch = val; s->write_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: - pit_load_count(kvm, addr, s->write_latch | (val << 8)); + pit_load_count(pit, addr, s->write_latch | (val << 8)); s->write_state = RW_STATE_WORD0; break; } @@ -506,7 +502,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; int ret, count; struct kvm_kpit_channel_state *s; if (!pit_in_range(addr)) @@ -543,20 +538,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu, switch (s->read_state) { default: case RW_STATE_LSB: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = count & 0xff; break; case RW_STATE_MSB: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; break; case RW_STATE_WORD0: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = count & 0xff; s->read_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; s->read_state = RW_STATE_WORD0; break; @@ -577,14 +572,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; if (addr != KVM_SPEAKER_BASE_ADDRESS) return -EOPNOTSUPP; mutex_lock(&pit_state->lock); pit_state->speaker_data_on = (val >> 1) & 1; - pit_set_gate(kvm, 2, val & 1); + pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; } @@ -595,7 +589,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; unsigned int refresh_clock; int ret; if (addr != KVM_SPEAKER_BASE_ADDRESS) @@ -605,8 +598,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | - (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); + ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | + (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); @@ -625,7 +618,7 @@ void kvm_pit_reset(struct kvm_pit *pit) c = &pit->pit_state.channels[i]; c->mode = 0xff; c->gate = (i != 2); - pit_load_count(pit->kvm, i, 0); + pit_load_count(pit, i, 0); } mutex_unlock(&pit->pit_state.lock); @@ -687,7 +680,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) } init_kthread_work(&pit->expired, pit_do_work); - kvm->arch.vpit = pit; pit->kvm = kvm; pit_state = &pit->pit_state; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index f8cf4b84f435..a6aceaf08df5 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -56,9 +56,11 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void 
kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); + void kvm_pit_reset(struct kvm_pit *pit); +void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4654e4150b0..a88e1a3eeb69 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3613,11 +3613,13 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int i; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + memcpy(&pit->pit_state, ps, sizeof(struct kvm_pit_state)); for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); + mutex_unlock(&pit->pit_state.lock); return 0; } @@ -3637,18 +3639,20 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) int start = 0; int i; u32 prev_legacy, cur_legacy; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; if (!prev_legacy && cur_legacy) start = 1; - memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, - sizeof(kvm->arch.vpit->pit_state.channels)); - kvm->arch.vpit->pit_state.flags = ps->flags; + memcpy(&pit->pit_state.channels, &ps->channels, + sizeof(pit->pit_state.channels)); + pit->pit_state.flags = ps->flags; for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, start && i == 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + mutex_unlock(&pit->pit_state.lock); return 0; } -- cgit From b39c90b6560c4458d46cb243abea0dcb7bc126d1 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:44 +0100 Subject: KVM: i8254: remove unnecessary uses of PIT state lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - kvm_create_pit had to lock only because it exposed kvm->arch.vpit very early, but initialization doesn't use kvm->arch.vpit since the last patch, so we can drop locking. - kvm_free_pit is only run after there are no users of KVM and therefore is the sole actor. - Locking in kvm_vm_ioctl_reinject doesn't do anything, because reinject is only protected at that place. - kvm_pit_reset isn't used anywhere and its locking can be dropped if we hide it. Removing useless locking allows us to see what is actually being protected by the PIT state lock (values accessible from the guest).
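For illustration only, a minimal sketch of the pattern behind the first point (a hypothetical simplification, not the patch itself): no lock is needed while an object is still private to its creator, as long as it is fully initialized before the pointer that makes it reachable is published:

	struct kvm_pit *pit = kzalloc(sizeof(*pit), GFP_KERNEL);

	if (!pit)
		return NULL;
	/* set up timer, worker and notifiers here; nothing else can see
	 * 'pit' yet, so no locking is required */
	return pit;	/* the caller publishes it via kvm->arch.vpit last */
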
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 9 +-------- arch/x86/kvm/i8254.h | 9 +++++---- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 2afe09b054e7..b8582fbe4fcf 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -607,12 +607,11 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, return 0; } -void kvm_pit_reset(struct kvm_pit *pit) +static void kvm_pit_reset(struct kvm_pit *pit) { int i; struct kvm_kpit_channel_state *c; - mutex_lock(&pit->pit_state.lock); pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; @@ -620,7 +619,6 @@ void kvm_pit_reset(struct kvm_pit *pit) c->gate = (i != 2); pit_load_count(pit, i, 0); } - mutex_unlock(&pit->pit_state.lock); kvm_pit_reset_reinject(pit); } @@ -663,7 +661,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) } mutex_init(&pit->pit_state.lock); - mutex_lock(&pit->pit_state.lock); pid = get_pid(task_tgid(current)); pid_nr = pid_vnr(pid); @@ -673,7 +670,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker_task)) { - mutex_unlock(&pit->pit_state.lock); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; @@ -689,7 +685,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); pit_state->reinject = true; - mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -737,13 +732,11 @@ void kvm_free_pit(struct kvm *kvm) &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &kvm->arch.vpit->pit_state.irq_ack_notifier); - mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.timer; hrtimer_cancel(timer); flush_kthread_work(&kvm->arch.vpit->expired); kthread_stop(kvm->arch.vpit->worker_task); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); kfree(kvm->arch.vpit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index a6aceaf08df5..840fbb3cb626 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -22,17 +22,19 @@ struct kvm_kpit_channel_state { }; struct kvm_kpit_state { + /* All members before "struct mutex lock" are protected by the lock. 
*/ struct kvm_kpit_channel_state channels[3]; u32 flags; bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm *kvm; u32 speaker_data_on; + struct mutex lock; + struct kvm *kvm; struct kvm_pit *pit; + bool reinject; + atomic_t pending; /* accumulated triggered timers */ atomic_t irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -59,7 +61,6 @@ struct kvm_pit { struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); -void kvm_pit_reset(struct kvm_pit *pit); void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, int hpet_legacy_start); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a88e1a3eeb69..ce4e91db5bae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3661,9 +3661,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { if (!kvm->arch.vpit) return -ENXIO; - mutex_lock(&kvm->arch.vpit->pit_state.lock); + kvm->arch.vpit->pit_state.reinject = control->pit_reinject; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return 0; } -- cgit From 71474e2f0f439b83b7b53ee6e9cf4f44c15b5806 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:45 +0100 Subject: KVM: i8254: remove notifiers from PIT discard policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discard policy doesn't rely on information from notifiers, so we don't need to register notifiers unconditionally. We kept correct counts in case userspace switched between policies during runtime, but that can be avoided by resetting the state. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 38 +++++++++++++++++++++++++++----------- arch/x86/kvm/i8254.h | 1 + arch/x86/kvm/x86.c | 12 ++++++++++-- 3 files changed, 38 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index b8582fbe4fcf..7a2f14bdf4b5 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -225,7 +225,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. */ smp_mb(); - if (atomic_dec_if_positive(&ps->pending) > 0 && ps->reinject) + if (atomic_dec_if_positive(&ps->pending) > 0) queue_kthread_work(&ps->pit->worker, &ps->pit->expired); } @@ -301,6 +301,27 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) atomic_set(&pit->pit_state.irq_ack, 1); } +void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) +{ + struct kvm_kpit_state *ps = &pit->pit_state; + struct kvm *kvm = pit->kvm; + + if (ps->reinject == reinject) + return; + + if (reinject) { + /* The initial state is preserved while ps->reinject == 0. 
*/ + kvm_pit_reset_reinject(pit); + kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + } else { + kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + } + + ps->reinject = reinject; +} + static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) { struct kvm_kpit_state *ps = &pit->pit_state; @@ -681,15 +702,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit_state = &pit->pit_state; pit_state->pit = pit; hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->reinject = true; + pit->mask_notifier.func = pit_mask_notifer; kvm_pit_reset(pit); - pit->mask_notifier.func = pit_mask_notifer; - kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_pit_set_reinject(pit, true); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, @@ -712,8 +732,7 @@ fail_unregister: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_pit_set_reinject(pit, false); kvm_free_irq_source_id(kvm, pit->irq_source_id); kthread_stop(pit->worker_task); kfree(pit); @@ -728,10 +747,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->speaker_dev); - kvm_unregister_irq_mask_notifier(kvm, 0, - &kvm->arch.vpit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, - &kvm->arch.vpit->pit_state.irq_ack_notifier); + kvm_pit_set_reinject(kvm->arch.vpit, false); timer = &kvm->arch.vpit->pit_state.timer; hrtimer_cancel(timer); flush_kthread_work(&kvm->arch.vpit->expired); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 840fbb3cb626..1945635904a7 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -63,5 +63,6 @@ void kvm_free_pit(struct kvm *kvm); void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, int hpet_legacy_start); +void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce4e91db5bae..76f9f48898a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3659,10 +3659,18 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) static int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) { - if (!kvm->arch.vpit) + struct kvm_pit *pit = kvm->arch.vpit; + + if (!pit) return -ENXIO; - kvm->arch.vpit->pit_state.reinject = control->pit_reinject; + /* pit->pit_state.lock was overloaded to prevent userspace from getting + * an inconsistent state after running multiple KVM_REINJECT_CONTROL + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. 
+ */ + mutex_lock(&pit->pit_state.lock); + kvm_pit_set_reinject(pit, control->pit_reinject); + mutex_unlock(&pit->pit_state.lock); return 0; } -- cgit From 10d2482126d02682d5d21aa5ecdf76d5f49c6740 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:46 +0100 Subject: KVM: i8254: refactor kvm_create_pit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locks are gone, so we don't need to duplicate error paths. Use goto everywhere. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7a2f14bdf4b5..c24735ae1871 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -676,10 +676,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return NULL; pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) { - kfree(pit); - return NULL; - } + if (pit->irq_source_id < 0) + goto fail_request; mutex_init(&pit->pit_state.lock); @@ -690,11 +688,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) init_kthread_worker(&pit->worker); pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) { - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - return NULL; - } + if (IS_ERR(pit->worker_task)) + goto fail_kthread; + init_kthread_work(&pit->expired, pit_do_work); pit->kvm = kvm; @@ -715,7 +711,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) - goto fail; + goto fail_register_pit; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); @@ -723,18 +719,19 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) KVM_SPEAKER_BASE_ADDRESS, 4, &pit->speaker_dev); if (ret < 0) - goto fail_unregister; + goto fail_register_speaker; } return pit; -fail_unregister: +fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); - -fail: +fail_register_pit: kvm_pit_set_reinject(pit, false); - kvm_free_irq_source_id(kvm, pit->irq_source_id); kthread_stop(pit->worker_task); +fail_kthread: + kvm_free_irq_source_id(kvm, pit->irq_source_id); +fail_request: kfree(pit); return NULL; } -- cgit From 08e5ccf3ae8f2c5c8319fec521e312f7e775cb84 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:47 +0100 Subject: KVM: i8254: refactor kvm_free_pit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Could be easier to read, but git history will become deeper. 
Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c24735ae1871..055018e833b6 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -738,18 +738,16 @@ fail_request: void kvm_free_pit(struct kvm *kvm) { - struct hrtimer *timer; - - if (kvm->arch.vpit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &kvm->arch.vpit->speaker_dev); - kvm_pit_set_reinject(kvm->arch.vpit, false); - timer = &kvm->arch.vpit->pit_state.timer; - hrtimer_cancel(timer); - flush_kthread_work(&kvm->arch.vpit->expired); - kthread_stop(kvm->arch.vpit->worker_task); - kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); - kfree(kvm->arch.vpit); + struct kvm_pit *pit = kvm->arch.vpit; + + if (pit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + kvm_pit_set_reinject(pit, false); + hrtimer_cancel(&pit->pit_state.timer); + flush_kthread_work(&pit->expired); + kthread_stop(pit->worker_task); + kvm_free_irq_source_id(kvm, pit->irq_source_id); + kfree(pit); } } -- cgit From a3e1311549593692d98cba1ceb479f070dcd9873 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:48 +0100 Subject: KVM: i8254: remove pit and kvm from kvm_kpit_state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm isn't ever used and pit can be accessed with container_of. If you *really* need kvm, pit_state_to_pit(ps)->kvm. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 14 +++++++++----- arch/x86/kvm/i8254.h | 2 -- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 055018e833b6..37e665c5f307 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -215,10 +215,16 @@ static void pit_latch_status(struct kvm_pit *pit, int channel) } } +static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps) +{ + return container_of(ps, struct kvm_pit, pit_state); +} + static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); + struct kvm_pit *pit = pit_state_to_pit(ps); atomic_set(&ps->irq_ack, 1); /* irq_ack should be set before pending is read. 
Order accesses with @@ -226,7 +232,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - queue_kthread_work(&ps->pit->worker, &ps->pit->expired); + queue_kthread_work(&pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -281,7 +287,7 @@ static void pit_do_work(struct kthread_work *work) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); - struct kvm_pit *pt = ps->kvm->arch.vpit; + struct kvm_pit *pt = pit_state_to_pit(ps); if (ps->reinject) atomic_inc(&ps->pending); @@ -338,12 +344,11 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&ps->timer); - flush_kthread_work(&ps->pit->expired); + flush_kthread_work(&pit->expired); ps->period = interval; ps->is_periodic = is_period; ps->timer.function = pit_timer_fn; - ps->kvm = pit->kvm; kvm_pit_reset_reinject(pit); @@ -696,7 +701,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->kvm = kvm; pit_state = &pit->pit_state; - pit_state->pit = pit; hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 1945635904a7..f365dce4fb8d 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -31,8 +31,6 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; - struct kvm *kvm; - struct kvm_pit *pit; bool reinject; atomic_t pending; /* accumulated triggered timers */ atomic_t irq_ack; -- cgit From 4a2095df8ab1671078c46d988e1b83baf42db4f6 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:49 +0100 Subject: KVM: i8254: remove pointless dereference of PIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PIT is known at that point. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 37e665c5f307..964902b33eed 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -267,8 +267,8 @@ static void pit_do_work(struct kthread_work *work) if (ps->reinject && !atomic_xchg(&ps->irq_ack, 0)) return; - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); + kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. -- cgit From 34f3941c42e22217e39f442e15401ae53a23f8cb Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:50 +0100 Subject: KVM: i8254: don't assume layout of kvm_kpit_state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit channels has offset 0 and correct size now, but that can change. 
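As an illustrative aside (hypothetical guards, not part of the patch): the old memcpy() of the whole pit_state silently relied on two build-time facts, which could have been spelled out as

	/* what the old copy implicitly assumed, written out for illustration */
	BUILD_BUG_ON(offsetof(struct kvm_kpit_state, channels) != 0);
	BUILD_BUG_ON(sizeof(struct kvm_pit_state) !=
		     sizeof(((struct kvm_kpit_state *)0)->channels));

Copying pit_state.channels explicitly removes the offset assumption, so only the size check remains, which is what the BUILD_BUG_ON added below asserts.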
Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76f9f48898a5..60d6c0036a98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3604,9 +3604,13 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; + + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); + + mutex_lock(&kps->lock); + memcpy(ps, &kps->channels, sizeof(*ps)); + mutex_unlock(&kps->lock); return 0; } @@ -3616,7 +3620,7 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) struct kvm_pit *pit = kvm->arch.vpit; mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state, ps, sizeof(struct kvm_pit_state)); + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); for (i = 0; i < 3; i++) kvm_pit_load_count(pit, i, ps->channels[i].count, 0); mutex_unlock(&pit->pit_state.lock); -- cgit From ab4c14763b434d48bc7732e475ff4d5b6b9d3e3b Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:51 +0100 Subject: KVM: i8254: move PIT timer function initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can do it just once. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 964902b33eed..68af4445d51d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -348,8 +348,6 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) ps->period = interval; ps->is_periodic = is_period; - ps->timer.function = pit_timer_fn; - kvm_pit_reset_reinject(pit); /* @@ -702,6 +700,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit_state = &pit->pit_state; hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->timer.function = pit_timer_fn; pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; -- cgit From a0aace5ac0efdb2bcb71e10d9c9ca6a851fa59f9 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 2 Mar 2016 22:56:52 +0100 Subject: KVM: i8254: turn kvm_kpit_state.reinject into atomic_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document possible races between readers and concurrent update to the ioctl. 
Suggested-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 8 ++++---- arch/x86/kvm/i8254.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 68af4445d51d..219ef855aae5 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -264,7 +264,7 @@ static void pit_do_work(struct kthread_work *work) int i; struct kvm_kpit_state *ps = &pit->pit_state; - if (ps->reinject && !atomic_xchg(&ps->irq_ack, 0)) + if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) return; kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); @@ -289,7 +289,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); struct kvm_pit *pt = pit_state_to_pit(ps); - if (ps->reinject) + if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); queue_kthread_work(&pt->worker, &pt->expired); @@ -312,7 +312,7 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) struct kvm_kpit_state *ps = &pit->pit_state; struct kvm *kvm = pit->kvm; - if (ps->reinject == reinject) + if (atomic_read(&ps->reinject) == reinject) return; if (reinject) { @@ -325,7 +325,7 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } - ps->reinject = reinject; + atomic_set(&ps->reinject, reinject); } static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index f365dce4fb8d..2f5af0798326 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -31,7 +31,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; - bool reinject; + atomic_t reinject; atomic_t pending; /* accumulated triggered timers */ atomic_t irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; -- cgit From 8f22372f85476bfe612136133d5883b28d163c23 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 26 Feb 2016 12:09:49 +0100 Subject: KVM: VMX: use vmcs_clear/set_bits for debug register exits Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index aa16d5874fe6..46154dac71e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5619,11 +5619,8 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers @@ -5660,8 +5657,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); @@ -5670,10 +5665,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; - 
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) -- cgit From 798e88b31fbe9863163054feb8432e62e77f539c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Feb 2016 15:28:51 +0100 Subject: KVM: MMU: cleanup handle_abnormal_pfn The goto and temporary variable are unnecessary, just use return statements. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 42ca0acc1c4e..bb223f8f3440 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2821,20 +2821,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { - bool ret = true; - /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); - goto exit; + return true; } if (unlikely(is_noslot_pfn(pfn))) vcpu_cache_mmio_info(vcpu, gva, gfn, access); - ret = false; -exit: - return ret; + return false; } static bool page_fault_can_be_fast(u32 error_code) -- cgit From 0a47cd85833e56574a926cad309726f4f7859544 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Feb 2016 13:54:25 +0100 Subject: KVM: MMU: Fix ubsan warnings kvm_mmu_pages_init is doing some really yucky stuff. It is setting up a sentinel for mmu_page_clear_parents; however, because of a) the way levels are numbered starting from 1 and b) the way mmu_page_path sizes its arrays with PT64_ROOT_LEVEL-1 elements, the access can be out of bounds. This is harmless because the code overwrites up to the first two elements of parents->idx and these are initialized, and because the sentinel is not needed in this case---mmu_page_clear_parents exits anyway when it gets to the end of the array. However ubsan complains, and everyone else should too. This fix does three things. First it makes the mmu_page_path arrays PT64_ROOT_LEVEL elements in size, so that we can write to them without checking the level in advance. Second it disintegrates kvm_mmu_pages_init between mmu_unsync_walk (to reset the struct kvm_mmu_pages) and for_each_sp (to place the NULL sentinel at the end of the current path). This is okay because the mmu_page_path is only used in mmu_pages_clear_parents; mmu_pages_clear_parents itself is called within a for_each_sp iterator, and hence always after a call to mmu_pages_next. Third it changes mmu_pages_clear_parents to just use the sentinel to stop iteration, without checking the bounds on level. 
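For illustration only, a simplified sketch of the sentinel idea (it ignores the early exit on unsync_children and passes the arrays directly, so it is not the kernel function itself):

	/* parent[] now has one slot per level, so the upward walk can stop on
	 * the NULL sentinel instead of comparing 'level' against an array
	 * bound. */
	static void clear_parents_sketch(struct kvm_mmu_page *parent[],
					 unsigned int idx[], int level)
	{
		while (parent[level]) {		/* NULL sentinel ends the loop */
			clear_unsync_child_bit(parent[level], idx[level]);
			level++;
		}
	}
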
Reported-by: Sasha Levin Reported-by: Mike Krinkin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 57 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bb223f8f3440..609fa5322f6a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1873,6 +1873,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { + pvec->nr = 0; if (!sp->unsync_children) return 0; @@ -1986,13 +1987,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) } struct mmu_page_path { - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; - unsigned int idx[PT64_ROOT_LEVEL-1]; + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; + unsigned int idx[PT64_ROOT_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ - for (i = mmu_pages_next(&pvec, &parents, -1), \ - sp = pvec.page[i].sp; \ + for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) @@ -2004,19 +2004,41 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; + unsigned idx = pvec->page[n].idx; + int level = sp->role.level; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - parents->idx[0] = pvec->page[n].idx; - return n; - } + parents->idx[level-1] = idx; + if (level == PT_PAGE_TABLE_LEVEL) + break; - parents->parent[sp->role.level-2] = sp; - parents->idx[sp->role.level-1] = pvec->page[n].idx; + parents->parent[level-2] = sp; } return n; } +static int mmu_pages_first(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + int level; + + if (pvec->nr == 0) + return 0; + + sp = pvec->page[0].sp; + level = sp->role.level; + WARN_ON(level == PT_PAGE_TABLE_LEVEL); + + parents->parent[level-2] = sp; + + /* Also set up a sentinel. Further entries in pvec are all + * children of sp, so this element is never overwritten. 
+ */ + parents->parent[level-1] = NULL; + return mmu_pages_next(pvec, parents, 0); +} + static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; @@ -2024,22 +2046,13 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) do { unsigned int idx = parents->idx[level]; - sp = parents->parent[level]; if (!sp) return; clear_unsync_child_bit(sp, idx); level++; - } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); -} - -static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, - struct mmu_page_path *parents, - struct kvm_mmu_pages *pvec) -{ - parents->parent[parent->role.level-1] = NULL; - pvec->nr = 0; + } while (!sp->unsync_children); } static void mmu_sync_children(struct kvm_vcpu *vcpu, @@ -2051,7 +2064,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { bool protected = false; @@ -2067,7 +2079,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_pages_init(parent, &parents, &pages); } } @@ -2305,7 +2316,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; @@ -2314,7 +2324,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, mmu_pages_clear_parents(&parents); zapped++; } - kvm_mmu_pages_init(parent, &parents, &pages); } return zapped; -- cgit From e23d3fef83df8e303fded0ab55b379beec0dd604 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Feb 2016 09:46:06 +0100 Subject: KVM: MMU: check kvm_mmu_pages and mmu_page_path indices Give a special invalid index to the root of the walk, so that we can check the consistency of kvm_mmu_pages and mmu_page_path. Signed-off-by: Xiao Guangrong [Extracted from a bigger patch proposed by Guangrong. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 609fa5322f6a..0a4dc9b54181 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1870,6 +1870,8 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return nr_unsync_leaf; } +#define INVALID_INDEX (-1) + static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { @@ -1877,7 +1879,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, if (!sp->unsync_children) return 0; - mmu_pages_add(pvec, sp, 0); + mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } @@ -2026,6 +2028,8 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, if (pvec->nr == 0) return 0; + WARN_ON(pvec->page[0].idx != INVALID_INDEX); + sp = pvec->page[0].sp; level = sp->role.level; WARN_ON(level == PT_PAGE_TABLE_LEVEL); @@ -2050,6 +2054,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; + WARN_ON(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); -- cgit From 4babf2c5efb79be778d45ebe95d9c9b3c880c83b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 16:04:00 +0100 Subject: x86: wire up preadv2 and pwritev2 Signed-off-by: Milosz Tanski [hch: rebased due to newly added syscalls] Reviewed-by: Stephen Bates Tested-by: Stephen Bates Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 2 ++ arch/x86/entry/syscalls/syscall_64.tbl | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index cb713df81180..b30dd8154cc2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -384,3 +384,5 @@ 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 377 i386 copy_file_range sys_copy_file_range +378 i386 preadv2 sys_preadv2 +379 i386 pwritev2 sys_pwritev2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a50bdc..31cec929eb8d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -333,6 +333,8 @@ 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 # # x32-specific system call numbers start at 512 to avoid cache impact -- cgit From 0e4d44151af7c8fca3d15c27d9b97d4ac41c102b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 4 Mar 2016 09:28:41 +0100 Subject: KVM: i8254: drop local copy of mul_u64_u32_div MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A function that does the same as i8254.c's muldiv64 has been added (for KVM's own use, in fact!) in include/linux/math64.h. Use it instead of muldiv64. 
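As an illustrative aside (not part of the patch): mul_u64_u32_div(a, mul, div) returns (a * mul) / div computed with a wide intermediate, so each call site converts mechanically, e.g. in create_pit_timer():

	/* before */ interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
	/* after  */ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);

For the maximum reload value (val = 65536 counts at KVM_PIT_FREQ = 1193182 Hz) both evaluate to roughly 54.9 ms, the familiar ~18.2 Hz PIT period.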
Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 219ef855aae5..a4bf5b45d65a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -51,26 +51,6 @@ #define RW_STATE_WORD0 3 #define RW_STATE_WORD1 4 -/* Compute with 96 bit intermediate result: (a*b)/c */ -static u64 muldiv64(u64 a, u32 b, u32 c) -{ - union { - u64 ll; - struct { - u32 low, high; - } l; - } u, res; - u64 rl, rh; - - u.ll = a; - rl = (u64)u.l.low * (u64)b; - rh = (u64)u.l.high * (u64)b; - rh += (rl >> 32); - res.l.high = div64_u64(rh, c); - res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); - return res.ll; -} - static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) { struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; @@ -139,7 +119,7 @@ static int pit_get_count(struct kvm_pit *pit, int channel) int counter; t = kpit_elapsed(pit, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { case 0: @@ -166,7 +146,7 @@ static int pit_get_out(struct kvm_pit *pit, int channel) int out; t = kpit_elapsed(pit, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { default: @@ -338,7 +318,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; - interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); + interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); -- cgit From cfa52c0cfa4d727aa3e457bf29aeff296c528a08 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 3 Mar 2016 02:03:11 +0100 Subject: x86/mm/kmmio: Fix mmiotrace for hugepages Because Linux might use bigger pages than the 4K pages to handle those mmio ioremaps, the kmmio code shouldn't rely on the page id as it currently does. Using the memory address instead of the page id lets us look up how big the page is and what its base address is, so that we won't get a page fault within the same page twice anymore. Tested-by: Pierre Moreau Signed-off-by: Karol Herbst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Cc: linux-x86_64@vger.kernel.org Cc: nouveau@lists.freedesktop.org Cc: pq@iki.fi Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1456966991-6861-1-git-send-email-nouveau@karolherbst.de Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 88 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34ed632..ddb2244b06a1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 
+226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. */ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. 
*/ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; -- cgit From ad32a1f3c36b046b2e7b0361b88c18e3af61419e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 23 Jan 2016 19:17:59 +0000 Subject: um: use %lx format specifiers for unsigned longs static analysis from cppcheck detected %x being used for unsigned longs: [arch/x86/um/os-Linux/task_size.c:112]: (warning) %x in format string (no. 1) requires 'unsigned int' but the argument type is 'unsigned long'. Use %lx instead of %x Signed-off-by: Colin Ian King Signed-off-by: Richard Weinberger --- arch/x86/um/os-Linux/task_size.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c index 8502ad30e61b..5adb6a2fd117 100644 --- a/arch/x86/um/os-Linux/task_size.c +++ b/arch/x86/um/os-Linux/task_size.c @@ -109,7 +109,7 @@ unsigned long os_get_top_address(void) exit(1); } - printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT); + printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT); printf("Locating the top of the address space ... "); fflush(stdout); @@ -134,7 +134,7 @@ out: exit(1); } top <<= UM_KERN_PAGE_SHIFT; - printf("0x%x\n", top); + printf("0x%lx\n", top); return top; } -- cgit From b24b78a113164a84bf242b08ef2165365445ea2b Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 4 Mar 2016 10:53:39 +0100 Subject: klp: remove superfluous errors in asm/livepatch.h There is an #error in asm/livepatch.h for both x86 and s390 in !CONFIG_LIVEPATCH cases. It does not make much sense as pointed out by Michael Ellerman. One can happily include asm/livepatch.h with CONFIG_LIVEPATCH. Remove it as useless. 
Suggested-by: Michael Ellerman Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index e795f5274217..8acfe798625b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -40,8 +40,6 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } -#else -#error Include linux/livepatch.h, not asm/livepatch.h #endif #endif /* _ASM_X86_LIVEPATCH_H */ -- cgit From 335e073faacc9d90bee1527fa2550d6df045df63 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sun, 6 Mar 2016 22:20:25 +0100 Subject: klp: remove CONFIG_LIVEPATCH dependency from klp headers There is no need for livepatch.h (generic and arch-specific) to depend on CONFIG_LIVEPATCH. Remove that superfluous dependency. Reported-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 8acfe798625b..7e68f9558552 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -25,7 +25,6 @@ #include #include -#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,6 +39,5 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } -#endif #endif /* _ASM_X86_LIVEPATCH_H */ -- cgit From bc4b024a8b8bd7dceb2697299aad2bda57d065e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Mar 2016 10:40:02 -0600 Subject: PCI: Move pci_dma_* helpers to common code For a long time all architectures implement the pci_dma_* functions using the generic DMA API, and they all use the same header to do so. Move this header, pci-dma-compat.h, to include/linux and include it from the generic pci.h instead of having each arch duplicate this include. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/pci.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 462594320d39..7fd8eaa8714d 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -105,9 +105,6 @@ void native_restore_msi_irqs(struct pci_dev *dev); #include #endif -/* implement the pci_ DMA API in terms of the generic device dma_ one */ -#include - /* generic pci stuff */ #include -- cgit From bc864af13f34d19c911f5691d87bdacc9ce109f5 Mon Sep 17 00:00:00 2001 From: Chris Bainbridge Date: Mon, 7 Mar 2016 11:10:00 +0100 Subject: x86/microcode/intel: Change checksum variables to u32 Microcode checksum verification should be done using unsigned 32-bit values otherwise the calculation overflow results in undefined behaviour. This is also nicely documented in the SDM, section "Microcode Update Checksum": "To check for a corrupt microcode update, software must perform a unsigned DWORD (32-bit) checksum of the microcode update. Even though some fields are signed, the checksum procedure treats all DWORDs as unsigned. Microcode updates with a header version equal to 00000001H must sum all DWORDs that comprise the microcode update. A valid checksum check will yield a value of 00000000H." but for some reason the code has been using ints from the very beginning. 
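A minimal sketch of the unsigned DWORD checksum the SDM describes (illustrative only, with a hypothetical helper name rather than the kernel function):

	/* Sum every DWORD of the update as u32: wraparound is well defined
	 * for unsigned arithmetic and a valid image sums to 0. */
	static bool ucode_sum_is_zero(const void *mc, unsigned int total_size)
	{
		const u32 *p = mc;
		u32 sum = 0;
		unsigned int i;

		for (i = 0; i < total_size / sizeof(u32); i++)
			sum += p[i];

		return sum == 0;
	}
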
In practice, this bug possibly manifested itself only when doing the microcode data checksum - apparently, currently shipped Intel microcode doesn't have an extended signature table for which we do checksum verification too. UBSAN: Undefined behaviour in arch/x86/kernel/cpu/microcode/intel_lib.c:105:12 signed integer overflow: -1500151068 + -2125470173 cannot be represented in type 'int' CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc5+ #495 ... Call Trace: dump_stack ? inotify_ioctl ubsan_epilogue handle_overflow __ubsan_handle_add_overflow microcode_sanity_check get_matching_model_microcode.isra.2.constprop.8 ? early_idt_handler_common ? strlcpy ? find_cpio_data load_ucode_intel_bsp load_ucode_bsp ? load_ucode_bsp x86_64_start_kernel [ Expand and massage commit message. ] Signed-off-by: Chris Bainbridge Signed-off-by: Borislav Petkov Cc: hmh@hmh.eng.br Link: http://lkml.kernel.org/r/1456834359-5132-1-git-send-email-chris.bainbridge@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index b96896bcbdaf..99ca2c935777 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err) unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; + u32 sum, orig_sum, ext_sigcount = 0, i; struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); @@ -85,8 +85,8 @@ int microcode_sanity_check(void *mc, int print_err) /* check extended table checksum */ if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; + u32 ext_table_sum = 0; + u32 *ext_tablep = (u32 *)ext_header; i = ext_table_size / DWSIZE; while (i--) @@ -102,7 +102,7 @@ int microcode_sanity_check(void *mc, int print_err) orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / DWSIZE; while (i--) - orig_sum += ((int *)mc)[i]; + orig_sum += ((u32 *)mc)[i]; if (orig_sum) { if (print_err) pr_err("aborting, bad checksum\n"); -- cgit From c0414622177ae4739a49ca7dad4306a681e2878b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:01 +0100 Subject: x86/microcode/intel: Get rid of DWSIZE sizeof(u32) is perfectly clear as it is. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-3-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/microcode_intel.h | 1 - arch/x86/kernel/cpu/microcode/intel_lib.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 8559b0102ea1..603417f8dd6c 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -40,7 +40,6 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? 
\ diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 99ca2c935777..2b2d1354dc70 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -88,7 +88,7 @@ int microcode_sanity_check(void *mc, int print_err) u32 ext_table_sum = 0; u32 *ext_tablep = (u32 *)ext_header; - i = ext_table_size / DWSIZE; + i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { @@ -100,7 +100,7 @@ int microcode_sanity_check(void *mc, int print_err) /* calculate the checksum */ orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) orig_sum += ((u32 *)mc)[i]; if (orig_sum) { -- cgit From 7d0161569a3b66aaa01520002c3e5fd7126d071f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:02 +0100 Subject: x86/microcode/intel: Merge two consecutive if-statements Merge the two consecutive "if (ext_table_size)". No functional change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-4-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 2b2d1354dc70..ffb1bbf45db0 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -68,6 +68,9 @@ int microcode_sanity_check(void *mc, int print_err) } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) @@ -81,12 +84,9 @@ int microcode_sanity_check(void *mc, int print_err) return -EFAULT; } ext_sigcount = ext_header->count; - } - /* check extended table checksum */ - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep = (u32 *)ext_header; + /* check extended table checksum */ + ext_tablep = (u32 *)ext_header; i = ext_table_size / sizeof(u32); while (i--) -- cgit From 5b46b5e003724547f0c83041cada15f9f496590d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:03 +0100 Subject: x86/microcode/intel: Improve microcode sanity-checking error messages Turn them into proper sentences. Add comments to microcode_sanity_check() to explain what it does. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-5-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 36 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ffb1bbf45db0..23b1d92342e3 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -57,15 +57,16 @@ int microcode_sanity_check(void *mc, int print_err) if (data_size + MC_HEADER_SIZE > total_size) { if (print_err) - pr_err("error! Bad data size in microcode data file\n"); + pr_err("Error: bad microcode data file size.\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { if (print_err) - pr_err("error! 
Unknown microcode update format\n"); + pr_err("Error: invalid/unknown microcode update format.\n"); return -EINVAL; } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { u32 ext_table_sum = 0; @@ -74,43 +75,58 @@ int microcode_sanity_check(void *mc, int print_err) if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); + pr_err("Error: truncated extended signature table.\n"); return -EINVAL; } + ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); + pr_err("Error: extended signature table size mismatch.\n"); return -EFAULT; } + ext_sigcount = ext_header->count; - /* check extended table checksum */ + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ ext_tablep = (u32 *)ext_header; i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); + pr_warn("Bad extended signature table checksum, aborting.\n"); return -EINVAL; } } - /* calculate the checksum */ + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) orig_sum += ((u32 *)mc)[i]; + if (orig_sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad microcode data checksum, aborting.\n"); return -EINVAL; } + if (!ext_table_size) return 0; - /* check extended signature checksum */ + + /* + * Check extended signature checksum: 0 => valid. + */ for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; @@ -119,7 +135,7 @@ int microcode_sanity_check(void *mc, int print_err) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad extended signature checksum, aborting.\n"); return -EINVAL; } } -- cgit From 4ace2e7a48ab426eaa9745ace4c50c6a7adb3992 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:04 +0100 Subject: x86/microcode/intel: Drop orig_sum from ext signature checksum It is 0 because for !0 values we would have exited already. 
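A minimal standalone sketch of why this is safe (plain C; hdr and ext stand in for the three summed header fields sig + pf + cksum and are not the kernel variables themselves): orig_sum is known to be 0 at this point, so the old expression orig_sum - hdr + ext reduces to ext - hdr under u32 wraparound, and that is non-zero exactly when hdr - ext is non-zero, which is what the new code tests.

#include <assert.h>
#include <stdint.h>

static int old_check(uint32_t orig_sum, uint32_t hdr, uint32_t ext)
{
	return (uint32_t)(orig_sum - hdr + ext) != 0;	/* non-zero => bad checksum */
}

static int new_check(uint32_t hdr, uint32_t ext)
{
	return (uint32_t)(hdr - ext) != 0;
}

int main(void)
{
	uint32_t hdr = 0x12345678u, ext = 0x9abcdef0u;

	/* With orig_sum == 0 the two checks always agree. */
	assert(old_check(0, hdr, ext) == new_check(hdr, ext));
	return 0;
}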
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-6-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 23b1d92342e3..2ce1a7dc45b7 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -130,9 +130,9 @@ int microcode_sanity_check(void *mc, int print_err) for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) pr_err("Bad extended signature checksum, aborting.\n"); -- cgit From adc53f2e0ae2fcff10a4b981df14729ffb1482fc Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:17 +0100 Subject: x86/mce: Move MCx_CONFIG MSR definitions Those MSRs are used only by the MCE code so move them there. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1456785179-14378-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/include/asm/msr-index.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527e462f..80ba0a8d6d06 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -91,6 +91,10 @@ #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" +/* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 552346598dab..b05402ef3b84 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -264,10 +264,6 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -/* 'SMCA': AMD64 Scalable MCA */ -#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 -#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 -- cgit From be0aec23bf4624fd55650629fe8df20483487049 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:18 +0100 Subject: x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. 
] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 59 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 ++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 80ba0a8d6d06..9c467fe00551 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,18 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. + */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -93,7 +105,9 @@ /* AMD Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) /* * This structure contains all data related to the MCE log. Also @@ -291,4 +305,49 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. 
+ */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE = 0, /* Core errors */ + SMCA_DF, /* Data Fabric */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *name; + unsigned int hwid; +}; + +extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS = 0, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 cache */ + SMCA_DE, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX, /* Execution unit */ + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 cache */ + N_CORE_MCA_BLOCKS +}; + +extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; + +enum amd_df_mca_blocks { + SMCA_CS = 0, /* Coherent Slave */ + SMCA_PIE, /* Power management, Interrupts, etc */ + N_DF_BLOCKS +}; + +extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +#endif + + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27bd5797..ee487a93ebe7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ -- cgit From 8dd1e17a55b0bb1206c71c7a4344c5e3037cdf65 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:19 +0100 Subject: x86/mce/AMD: Fix logic to obtain block address In upcoming processors, the BLKPTR field is no longer used to indicate the MSR number of the additional register. Instead, it simply indicates the presence of additional MSRs. Fix the logic here to gather the MSR address from MSR_AMD64_SMCA_MCx_MISC() for newer processors and fall back to existing logic for older processors. [ Drop nextaddr_out label; style cleanups.
] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-4-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 84 +++++++++++++++++++++++------------- 2 files changed, 59 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9c467fe00551..72f86882d62a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -104,10 +104,14 @@ #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) /* * This structure contains all data related to the MCE log. Also diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ee487a93ebe7..a53eb1ba587e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -304,6 +304,51 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if (mce_flags.smca) { + if (!block) { + addr = MSR_AMD64_SMCA_MCx_MISC(bank); + } else { + /* + * For SMCA enabled processors, BLKPTR field of the + * first MISC register (MCx_MISC0) indicates presence of + * additional MISC register set (MISC1-4). 
+ */ + u32 low, high; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } + return addr; + } + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = MSR_IA32_MCx_MISC(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) @@ -366,16 +411,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MCx_MISC(bank); - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - - address += MCG_XBLK_ADDR; - } else - ++address; + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -480,16 +518,9 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MCx_MISC(bank); - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -709,16 +740,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; - err = allocate_threshold_blocks(cpu, bank, ++block, address); + err = allocate_threshold_blocks(cpu, bank, block, address); if (err) goto out_free; -- cgit From 2cd3b5f9033f0b051842a279dac5a54271cbd3c8 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:20 +0100 Subject: x86/mce: Clarify comments regarding deferred error Deferred errors indicate errors that hardware could not fix. But it still does not cause any interruption to program flow. So it does not generate any #MC and UC bit in MCx_STATUS is not set. Correct comment. 
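A minimal sketch (plain user-space C over a raw MCi_STATUS value; only the DEFERRED bit position from this series is assumed, the helper itself is hypothetical) of how a decoder might report such an error:

#include <stdint.h>
#include <stdio.h>

#define MCI_STATUS_DEFERRED	(1ULL << 44)	/* as defined in <asm/mce.h> above */

static void report_deferred(uint64_t mci_status)
{
	/* Deferred: hardware could not correct it, but no #MC was raised. */
	if (mci_status & MCI_STATUS_DEFERRED)
		printf("deferred (uncorrected, no exception) error\n");
}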
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-5-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 72f86882d62a..cfff34172be0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,7 +40,7 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ -#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ #define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -- cgit From ea2ca36b658cfc6081ee454e97593c81f646806e Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:21 +0100 Subject: x86/mce/AMD: Document some functionality In an attempt to aid in understanding of what the threshold_block structure holds, provide comments to describe the members here. Also, trim comments around threshold_restart_bank() and update copyright info. No functional change is introduced. Signed-off-by: Aravind Gopalakrishnan [ Shorten comments. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-6-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_nb.h | 26 +++++++++++++++++--------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 ++----- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 3c56ef1ae068..5e828da2e18f 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -27,15 +27,23 @@ struct amd_l3_cache { }; struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; /* Number within bank */ + unsigned int bank; /* MCA bank the block belongs to */ + unsigned int cpu; /* CPU which controls MCA bank */ + u32 address; /* MSR address for the block */ + u16 interrupt_enable; /* Enable/Disable APIC interrupt */ + bool interrupt_capable; /* Bank can generate an interrupt. */ + + u16 threshold_limit; /* + * Value upon which threshold + * interrupt is generated. + */ + + struct kobject kobj; /* sysfs object */ + struct list_head miscj; /* + * List of threshold blocks + * within a bank. + */ }; struct threshold_bank { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a53eb1ba587e..9d656fd436ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005-2015 Advanced Micro Devices, Inc. + * (c) 2005-2016 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. 
See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -201,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* - * Called via smp_call_function_single(), must be called with correct - * cpu affinity. - */ +/* Reprogram MCx_MISC MSR behind this threshold bank. */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; -- cgit From e72daf3f4d764c47fb71c9bdc7f9c54a503825b1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 1 Mar 2016 20:03:52 +0100 Subject: perf/x86/intel: Use PAGE_SIZE for PEBS buffer size on Core2 Using PAGE_SIZE buffers makes the WRMSR to PERF_GLOBAL_CTRL in intel_pmu_enable_all() mysteriously hang on Core2. As a workaround, we don't do this. The hard lockup is easily triggered by running 'perf test attr' repeatedly. Most of the time it gets stuck on sample session with small periods. # perf test attr -vv 14: struct perf_event_attr setup : --- start --- ... 'PERF_TEST_ATTR=/tmp/tmpuEKz3B /usr/bin/perf record -o /tmp/tmpuEKz3B/perf.data -c 123 kill >/dev/null 2>&1' ret 1 Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Link: http://lkml.kernel.org/r/20160301190352.GA8355@krava.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 13 +++++++++++-- arch/x86/events/perf_event.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c8a243d6fc82..22ece02d01b6 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -269,7 +269,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -286,7 +286,7 @@ static int alloc_pebs_buffer(int cpu) per_cpu(insn_buffer, cpu) = ibuffer; } - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; ds->pebs_buffer_base = (u64)(unsigned long)buffer; ds->pebs_index = ds->pebs_buffer_base; @@ -1319,6 +1319,7 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; int format = x86_pmu.intel_cap.pebs_format; @@ -1327,6 +1328,14 @@ void __init intel_ds_init(void) case 0: pr_cont("PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + /* + * Using >PAGE_SIZE buffers makes the WRMSR to + * PERF_GLOBAL_CTRL in intel_pmu_enable_all() + * mysteriously hang on Core2. + * + * As a workaround, we don't do this. 
+ */ + x86_pmu.pebs_buffer_size = PAGE_SIZE; x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7bb61e32fb29..1ab6279fed1d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -586,6 +586,7 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1; int pebs_record_size; + int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); -- cgit From c3d266c8a9838cc141b69548bc3b1b18808ae8c4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 3 Mar 2016 18:07:28 -0500 Subject: perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi This patch tries to fix a PEBS warning found in my stress test. The following perf command can easily trigger the pebs warning or spurious NMI error on Skylake/Broadwell/Haswell platforms: sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a Also the NMI watchdog must be enabled. For this case, the events number is larger than counter number. So perf has to do multiplexing. In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out old events, rotate_ctx, schedule in new events and finally perf_pmu_enable(). If the old events include precise event, the MSR_IA32_PEBS_ENABLE should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE should keep 0 until the perf_pmu_enable() is called and the new event is precise event. However, there is a corner case which could restore PEBS_ENABLE to stale value during the above period. In perf_pmu_disable(), GLOBAL_CTRL will be set to 0 to stop overflow and followed PMI. But there may be pending PMI from an earlier overflow, which cannot be stopped. So even GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At the end of the PMI handler, __intel_pmu_enable_all() will be called, which will restore the stale values if old events haven't scheduled out. Once the stale pebs value is set, it's impossible to be corrected if the new events are non-precise. Because the pebs_enabled will be set to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE setting. As a result, the following NMI with stale PEBS_ENABLE trigger pebs warning. The pending PMI after enabled=0 will become harmless if the NMI handler does not change the state. This patch checks cpuc->enabled in pmi and only restore the state when PMU is active. Here is the dump: Call Trace: [] dump_stack+0x63/0x85 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] intel_pmu_drain_pebs_nhm+0x2be/0x320 [] intel_pmu_handle_irq+0x279/0x460 [] ? native_write_msr_safe+0x6/0x40 [] ? vunmap_page_range+0x20d/0x330 [] ? unmap_kernel_range_noflush+0x11/0x20 [] ? ghes_copy_tofrom_phys+0x10f/0x2a0 [] ? ghes_read_estatus+0x98/0x170 [] perf_event_nmi_handler+0x2d/0x50 [] nmi_handle+0x69/0x120 [] default_do_nmi+0xe6/0x100 [] do_nmi+0xe2/0x130 [] end_repeat_nmi+0x1a/0x1e [] ? native_write_msr_safe+0x6/0x40 [] ? native_write_msr_safe+0x6/0x40 [] ? native_write_msr_safe+0x6/0x40 <> [] ? x86_perf_event_set_period+0xd8/0x180 [] x86_pmu_start+0x4c/0x100 [] x86_pmu_enable+0x28d/0x300 [] perf_pmu_enable.part.81+0x7/0x10 [] perf_mux_hrtimer_handler+0x200/0x280 [] ? __perf_install_in_context+0xc0/0xc0 [] __hrtimer_run_queues+0xfd/0x280 [] hrtimer_interrupt+0xa8/0x190 [] ? 
__perf_read_group_add.part.61+0x1a0/0x1a0 [] local_apic_timer_interrupt+0x38/0x60 [] smp_apic_timer_interrupt+0x3d/0x50 [] apic_timer_interrupt+0x8c/0xa0 [] ? __perf_read_group_add.part.61+0x1a0/0x1a0 [] ? smp_call_function_single+0xd5/0x130 [] ? smp_call_function_single+0xcb/0x130 [] ? __perf_read_group_add.part.61+0x1a0/0x1a0 [] event_function_call+0x10a/0x120 [] ? ctx_resched+0x90/0x90 [] ? cpu_clock_event_read+0x30/0x30 [] ? _perf_event_disable+0x60/0x60 [] _perf_event_enable+0x5b/0x70 [] perf_event_for_each_child+0x38/0xa0 [] ? _perf_event_disable+0x60/0x60 [] perf_ioctl+0x12d/0x3c0 [] ? selinux_file_ioctl+0x95/0x1e0 [] do_vfs_ioctl+0xa1/0x5a0 [] ? sched_clock+0x9/0x10 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x1a/0xa4 ---[ end trace aef202839fe9a71d ]--- Uhhuh. NMI received for unknown reason 2d on CPU 2. Do you have a strange power saving mode enabled? Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com [ Fixed various typos and other small details. ] Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 13 +++++++++++++ arch/x86/events/intel/core.c | 15 +++++++++++++-- arch/x86/events/intel/knc.c | 4 +++- 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7402c8182813..5e830d0c95c9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -597,6 +597,19 @@ void x86_pmu_disable_all(void) } } +/* + * There may be PMI landing after enabled=0. The PMI hitting could be before or + * after disable_all. + * + * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. + * It will not be re-enabled in the NMI handler again, because enabled=0. After + * handling the NMI, disable_all will be called, which will not change the + * state either. If PMI hits after disable_all, the PMU is already disabled + * before entering NMI handler. The NMI handler will not change the state + * either. + * + * So either situation is harmless. + */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a7ec685657a5..b3f6349a33b5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1502,7 +1502,15 @@ static __initconst const u64 knl_hw_cache_extra_regs }; /* - * Use from PMIs where the LBRs are already disabled. + * Used from PMIs where the LBRs are already disabled. + * + * This function could be called consecutively. It is required to remain in + * disabled state if called consecutively. + * + * During consecutive calls, the same disable value will be written to related + * registers, so the PMU state remains unchanged. hw.state in + * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive + * calls. */ static void __intel_pmu_disable_all(void) { @@ -1929,7 +1937,10 @@ again: goto again; done: - __intel_pmu_enable_all(0, true); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + __intel_pmu_enable_all(0, true); + /* * Only unmask the NMI after the overflow counters * have been reset. 
This avoids spurious NMIs on diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 206226e08f49..548d5f774b07 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -263,7 +263,9 @@ again: goto again; done: - knc_pmu_enable_all(0); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + knc_pmu_enable_all(0); return handled; } -- cgit From 5690ae28e472d25e330ad0c637a5cea3fc39fb32 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Mar 2016 20:50:40 +0100 Subject: perf/x86/intel: Add definition for PT PMI bit This patch adds a definition for GLOBAL_OVFL_STATUS bit 55 which is used with the Processor Trace (PT) feature. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1457034642-21837-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7bcb861a04e5..5a2ed3ed2f26 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -165,6 +165,7 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_ASIF BIT_ULL(60) #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) /* * IBS cpuid feature detection -- cgit From 8077eca079a212f26419c57226f28696b7100683 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Mar 2016 20:50:41 +0100 Subject: perf/x86/pebs: Add workaround for broken OVFL status on HSW+ This patch fixes an issue with the GLOBAL_OVERFLOW_STATUS bits on Haswell, Broadwell and Skylake processors when using PEBS. The SDM stipulates that when the PEBS interrupt threshold is crossed, an interrupt is posted and the kernel is interrupted. The kernel will find GLOBAL_OVF_STATUS bit 62 set indicating there are PEBS records to drain. But the bits corresponding to the actual counters should NOT be set. The kernel follows the SDM and assumes that all PEBS events are processed in the drain_pebs() callback. The kernel then checks for remaining overflows on any other (non-PEBS) events and processes these in the for_each_bit_set(&status) loop. As it turns out, under certain conditions on HSW and later processors, on PEBS buffer interrupt, bit 62 is set but the counter bits may be set as well. In that case, the kernel drains PEBS and generates SAMPLES with the EXACT tag, then it processes the counter bits, and generates normal (non-EXACT) SAMPLES. I ran into this problem by trying to understand why on HSW sampling on a PEBS event was sometimes returning SAMPLES without the EXACT tag. This should not happen on user level code because HSW has the eventing_ip which always points to the instruction that caused the event. The workaround in this patch simply ensures that the bits for the counters used for PEBS events are cleared after the PEBS buffer has been drained. With this fix 100% of the PEBS samples on my user code report the EXACT tag.
Before: $ perf record -e cpu/event=0xd0,umask=0x81/upp ./multichase $ perf report -D | fgrep SAMPLES PERF_RECORD_SAMPLE(IP, 0x2): 11775/11775: 0x406de5 period: 73469 addr: 0 exact=Y \--- EXACT tag is missing After: $ perf record -e cpu/event=0xd0,umask=0x81/upp ./multichase $ perf report -D | fgrep SAMPLES PERF_RECORD_SAMPLE(IP, 0x4002): 11775/11775: 0x406de5 period: 73469 addr: 0 exact=Y \--- EXACT tag is set The problem tends to appear more often when multiple PEBS events are used. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1457034642-21837-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b3f6349a33b5..6567c6234253 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1892,6 +1892,16 @@ again: if (__test_and_clear_bit(62, (unsigned long *)&status)) { handled++; x86_pmu.drain_pebs(regs); + /* + * There are cases where, even though, the PEBS ovfl bit is set + * in GLOBAL_OVF_STATUS, the PEBS events may also have their + * overflow bits set for their counters. We must clear them + * here because they have been processed as exact samples in + * the drain_pebs() routine. They must not be processed again + * in the for_each_bit_set() loop for regular samples below. + */ + status &= ~cpuc->pebs_enabled; + status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; } /* -- cgit From b3e6246336a4a329644418a1c66e2c6bed44ef81 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Mar 2016 20:50:42 +0100 Subject: perf/x86/pebs: Add proper PEBS constraints for Broadwell This patch adds a Broadwell specific PEBS event constraint table. Broadwell has a fix for the HT corruption bug erratum HSD29 on Haswell. Therefore, there is no need to mark events 0xd0, 0xd1, 0xd2, 0xd3 as requiring the exclusive mode across both sibling HT threads. This holds true for regular counting and sampling (see core.c) and PEBS (ds.c) which we fix in this patch. In doing so, we relax event scheduling for these events; they can now be programmed on any of the 4 counters without impacting what is measured on the sibling thread.
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: adrian.hunter@intel.com Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1457034642-21837-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 24 ++++++++++++++++++++++++ arch/x86/events/perf_event.h | 2 ++ 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6567c6234253..edac81c4295a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3602,7 +3602,7 @@ __init int intel_pmu_init(void) intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; x86_pmu.pebs_prec_dist = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 22ece02d01b6..a99a8cbac729 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -722,6 +722,30 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_bdw_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + + struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). 
*/ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 1ab6279fed1d..24e259eb0b3a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -861,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_bdw_pebs_event_constraints[]; + extern struct event_constraint intel_skl_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); -- cgit From e17dc65328057c00db7e1bfea249c8771a78b30b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Mar 2016 14:25:24 -0800 Subject: perf/x86/intel: Fix PEBS data source interpretation on Nehalem/Westmere Jiri reported some time ago that some entries in the PEBS data source table in perf do not agree with the SDM. We investigated and the bits changed for Sandy Bridge, but the SDM was not updated. perf already implements the bits correctly for Sandy Bridge and later. This patch patches it up for Nehalem and Westmere. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1456871124-15985-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 ++ arch/x86/events/intel/ds.c | 11 ++++++++++- arch/x86/events/perf_event.h | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index edac81c4295a..68fa55b4d42e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3417,6 +3417,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); pr_cont("Nehalem events, "); @@ -3480,6 +3481,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); pr_cont("Westmere events, "); break; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a99a8cbac729..ce7211a07c0b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -51,7 +51,8 @@ union intel_x86_pebs_dse { #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) -static const u64 pebs_data_source[] = { +/* Version for Sandy Bridge and later */ +static u64 pebs_data_source[] = { P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ @@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = { OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ }; +/* Patch up minor differences in the bits */ +void __init intel_pmu_pebs_data_source_nhm(void) +{ + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 24e259eb0b3a..68155cafa8a1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -907,6 
+907,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_pebs_data_source_nhm(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); -- cgit From 1a8aa8acab4f3949e05ceb51e36f627b1651814c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Mar 2016 19:11:15 +0100 Subject: x86/apic: Deinline __default_send_IPI_*, save ~200 bytes __default_send_IPI_shortcut: 49 bytes, 2 callsites __default_send_IPI_dest_field: 108 bytes, 7 callsites text data bss dec hex filename 96184086 20860488 36122624 153167198 921255e vmlinux_before 96183823 20860520 36122624 153166967 9212477 vmlinux Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1457287876-6001-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ipi.h | 58 ++------------------------------------------ arch/x86/kernel/apic/ipi.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index cfc9a0d2d07c..a4fe16e42b7b 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void) cpu_relax(); } -static inline void -__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) -{ - /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. - */ - __xapic_wait_icr_idle(); - - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void - __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) -{ - unsigned long cfg; - - /* - * Wait for idle. - */ - if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); - else - __xapic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index eb45fc9b6124..28bde88b0085 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,66 @@ #include #include +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. 
As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + void default_send_IPI_single_phys(int cpu, int vector) { unsigned long flags; -- cgit From fe2f95468e4bdf4a526be4f86efaefe48ca63b83 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Mar 2016 19:11:16 +0100 Subject: x86/apic: Deinline _flat_send_IPI_mask, save ~150 bytes _flat_send_IPI_mask: 157 bytes, 3 callsites text data bss dec hex filename 96183823 20860520 36122624 153166967 9212477 vmlinux1_before 96183699 20860520 36122624 153166843 92123fb vmlinux Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1457287876-6001-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9968f30cca3e..76f89e2b245a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; -- cgit From b8c67b7a086c9b2f96a2cf95624a9000a6be2922 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Feb 2016 11:21:55 +0100 Subject: KVM: MMU: introduce kvm_mmu_flush_or_zap This is a generalization of mmu_pte_write_flush_tlb, that also takes care of calling kvm_mmu_commit_zap_page. The next patches will introduce more uses. 
Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0a4dc9b54181..6dae2356b9f5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4188,11 +4188,14 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, - bool remote_flush, bool local_flush) +static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, + struct list_head *invalid_list, + bool remote_flush, bool local_flush) { - if (zap_page) + if (!list_empty(invalid_list)) { + kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); return; + } if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); @@ -4320,7 +4323,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush, zap_page; + bool remote_flush, local_flush; union kvm_mmu_page_role mask = { }; mask.cr0_wp = 1; @@ -4337,7 +4340,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - zap_page = remote_flush = local_flush = false; + remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -4357,8 +4360,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { - zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -4380,8 +4382,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++spte; } } - mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); } -- cgit From 35a70510ee13b0ab130968ed399eeeb5106bc12e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Feb 2016 10:03:27 +0100 Subject: KVM: MMU: move TLB flush out of __kvm_sync_page By doing this, kvm_sync_pages can use __kvm_sync_page instead of reinventing it. Because of kvm_mmu_flush_or_zap, the code does not end up being more complex than before, and more cleanups to kvm_sync_pages will come in the next patches. 
Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 53 ++++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6dae2356b9f5..45a8a0605a09 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1932,10 +1932,24 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } +static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, + struct list_head *invalid_list, + bool remote_flush, bool local_flush) +{ + if (!list_empty(invalid_list)) { + kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + return; + } + + if (remote_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + else if (local_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); +} + static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1943,8 +1957,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, int ret; ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (ret) - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, !ret); return ret; } @@ -1975,17 +1988,11 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); kvm_unlink_unsync_page(vcpu->kvm, s); - if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); - continue; - } - flush = true; + if (!__kvm_sync_page(vcpu, s, &invalid_list, false)) + flush = true; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); } struct mmu_page_path { @@ -2071,6 +2078,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, while (mmu_unsync_walk(parent, &pages)) { bool protected = false; + bool flush = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu, sp->gfn); @@ -2079,10 +2087,12 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp, &invalid_list); + if (!kvm_sync_page(vcpu, sp, &invalid_list)) + flush = true; + mmu_pages_clear_parents(&parents); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); cond_resched_lock(&vcpu->kvm->mmu_lock); } } @@ -4188,21 +4198,6 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, - struct list_head *invalid_list, - bool remote_flush, bool local_flush) -{ - if (!list_empty(invalid_list)) { - kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); - return; - } - - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, const u8 *new, int *bytes) { -- cgit From df748f864a342375aaa52a7e043fae7142376d01 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Feb 2016 10:19:30 +0100 Subject: KVM: MMU: use kvm_sync_page in kvm_sync_pages If the last argument is true, kvm_unlink_unsync_page is called anyway in __kvm_sync_page (either by kvm_mmu_prepare_zap_page or by __kvm_sync_page itself). 
Therefore, kvm_sync_pages can just call kvm_sync_page, instead of going through kvm_unlink_unsync_page+__kvm_sync_page. Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45a8a0605a09..56be33714036 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1987,8 +1987,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - kvm_unlink_unsync_page(vcpu->kvm, s); - if (!__kvm_sync_page(vcpu, s, &invalid_list, false)) + if (!kvm_sync_page(vcpu, s, &invalid_list)) flush = true; } -- cgit From 9a43c5d9c3f13a2fc3864570e33438347319b584 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Feb 2016 10:28:01 +0100 Subject: KVM: MMU: cleanup __kvm_sync_page and its callers Calling kvm_unlink_unsync_page in the middle of __kvm_sync_page makes things unnecessarily tricky. If kvm_mmu_prepare_zap_page is called, it will call kvm_unlink_unsync_page too. So kvm_unlink_unsync_page can be called just as well at the beginning or the end of __kvm_sync_page... which means that we might do it in kvm_sync_page too and remove the parameter. kvm_sync_page ends up being the same code that kvm_sync_pages used to have before the previous patch. Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 56be33714036..88a1a79c869e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1917,16 +1917,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, /* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list, bool clear_unsync) + struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } - if (clear_unsync) - kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; @@ -1956,7 +1953,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, LIST_HEAD(invalid_list); int ret; - ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); + ret = __kvm_sync_page(vcpu, sp, &invalid_list); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, !ret); return ret; @@ -1972,7 +1969,8 @@ static void mmu_audit_disable(void) { } static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, invalid_list, true); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return __kvm_sync_page(vcpu, sp, invalid_list); } /* @gfn should be write-protected at the call site */ -- cgit From 1f50f1b3a4630966fb3cd1f56892acb03580bd37 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Feb 2016 11:07:14 +0100 Subject: KVM: MMU: invert return value of mmu.sync_page and *kvm_sync_page* Return true if the page was synced (and the TLB must be flushed) and false if the page was zapped. 
Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 ++++++++++++++----------------- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 88a1a79c869e..1c87102efb3d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1784,7 +1784,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 1; + return 0; } static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) @@ -1916,20 +1916,20 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((_sp)->role.direct || (_sp)->role.invalid) {} else /* @sp->gfn should be write-protected at the call site */ -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - return 0; + return true; } static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, @@ -1947,14 +1947,14 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) +static bool kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) { LIST_HEAD(invalid_list); int ret; ret = __kvm_sync_page(vcpu, sp, &invalid_list); - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, !ret); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, ret); return ret; } @@ -1966,7 +1966,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { kvm_unlink_unsync_page(vcpu->kvm, sp); @@ -1985,8 +1985,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - if (!kvm_sync_page(vcpu, s, &invalid_list)) - flush = true; + flush |= kvm_sync_page(vcpu, s, &invalid_list); } kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -2084,9 +2083,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - if (!kvm_sync_page(vcpu, sp, &invalid_list)) - flush = true; - + flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -2145,7 +2142,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + if (sp->unsync && !kvm_sync_page_transient(vcpu, sp)) break; if (sp->unsync_children) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4174cf290fa3..a1f5459edcec 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -943,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, 
sizeof(pt_element_t))) - return -EINVAL; + return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; @@ -975,7 +975,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable); } - return !nr_present; + return nr_present; } #undef pt_element_t -- cgit From 2a74003ae818cfaf129ed4df89499a1c2b796129 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Feb 2016 11:26:10 +0100 Subject: KVM: MMU: move zap/flush to kvm_mmu_get_page kvm_mmu_get_page is the only caller of kvm_sync_page_transient and kvm_sync_pages. Moving the handling of the invalid_list there removes the need for the underdocumented kvm_sync_page_transient function. Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1c87102efb3d..fecc9c51d924 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1947,18 +1947,6 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } -static bool kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = __kvm_sync_page(vcpu, sp, &invalid_list); - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, ret); - - return ret; -} - #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" #else @@ -1974,21 +1962,21 @@ static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } /* @gfn should be write-protected at the call site */ -static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, + struct list_head *invalid_list) { struct kvm_mmu_page *s; - LIST_HEAD(invalid_list); - bool flush = false; + bool ret = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - flush |= kvm_sync_page(vcpu, s, &invalid_list); + ret |= kvm_sync_page(vcpu, s, invalid_list); } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + return ret; } struct mmu_page_path { @@ -2119,6 +2107,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; + bool flush = false; + LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; role.level = level; @@ -2142,8 +2132,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync && !kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync) { + /* The page is good, but __kvm_sync_page might still end + * up zapping it. If so, break in order to rebuild it. 
+ */ + if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + break; + + WARN_ON(!list_empty(&invalid_list)); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } if (sp->unsync_children) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -2173,11 +2171,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); if (level > PT_PAGE_TABLE_LEVEL && need_sync) - kvm_sync_pages(vcpu, gfn); + flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); return sp; } -- cgit From 50c9e6f3a69dfa458ecb671bcbd11e2eea6db0c1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 25 Feb 2016 10:47:38 +0100 Subject: KVM: MMU: coalesce more page zapping in mmu_sync_children mmu_sync_children can only process up to 16 pages at a time. Check if we need to reschedule, and do not bother zapping the pages until that happens. Reviewed-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fecc9c51d924..754d2c4f6f99 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2059,24 +2059,31 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); + bool flush = false; while (mmu_unsync_walk(parent, &pages)) { bool protected = false; - bool flush = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu, sp->gfn); - if (protected) + if (protected) { kvm_flush_remote_tlbs(vcpu->kvm); + flush = false; + } for_each_sp(pages, sp, parents, i) { flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); - cond_resched_lock(&vcpu->kvm->mmu_lock); + if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + cond_resched_lock(&vcpu->kvm->mmu_lock); + flush = false; + } } + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) -- cgit From 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Feb 2016 12:51:19 +0100 Subject: KVM: MMU: simplify last_pte_bitmap Branch-free code is fun and everybody knows how much Avi loves it, but last_pte_bitmap takes it a bit to the extreme. Since the code is simply doing a range check, like (level == 1 || ((gpte & PT_PAGE_SIZE_MASK) && level < N) we can make it branch-free without storing the entire truth table; it is enough to cache N. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++----- arch/x86/kvm/mmu.c | 50 +++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1c3e390993a2..d110dc44d6c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -347,12 +347,8 @@ struct kvm_mmu { struct rsvd_bits_validate guest_rsvd_check; - /* - * Bitmap: bit set = last pte in walk - * index[0:1]: level (zero-based) - * index[2]: pte.ps - */ - u8 last_pte_bitmap; + /* Can have large pages at levels 2..last_nonleaf_level-1. 
*/ + u8 last_nonleaf_level; bool nx; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 754d2c4f6f99..2463de0b935c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3632,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +static inline bool is_last_gpte(struct kvm_mmu *mmu, + unsigned level, unsigned gpte) { - unsigned index; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - index = level - 1; - index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); - return mmu->last_pte_bitmap & (1 << index); + /* + * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. + * If it is clear, there are no large pages at this level, so clear + * PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - mmu->last_nonleaf_level; + + return gpte & PT_PAGE_SIZE_MASK; } #define PTTYPE_EPT 18 /* arbitrary */ @@ -3910,22 +3921,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } -static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { - u8 map; - unsigned level, root_level = mmu->root_level; - const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ - - if (root_level == PT32E_ROOT_LEVEL) - --root_level; - /* PT_PAGE_TABLE_LEVEL always terminates */ - map = 1 | (1 << ps_set_index); - for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { - if (level <= PT_PDPE_LEVEL - && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) - map |= 1 << (ps_set_index | (level - 1)); - } - mmu->last_pte_bitmap = map; + unsigned root_level = mmu->root_level; + + mmu->last_nonleaf_level = root_level; + if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) + mmu->last_nonleaf_level++; } static void paging64_init_context_common(struct kvm_vcpu *vcpu, @@ -3937,7 +3939,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; @@ -3964,7 +3966,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -4022,7 +4024,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4128,7 +4130,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); - update_last_pte_bitmap(vcpu, g_context); + update_last_nonleaf_level(vcpu, g_context); } static void init_kvm_mmu(struct kvm_vcpu *vcpu) -- cgit From bb9eadf0c35f2e7eb5ca6468f46ebb7473b85537 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Feb 2016 14:19:20 +0100 Subject: KVM: MMU: micro-optimize gpte_access Avoid 
AND-NOT; most x86 processors lack an instruction for it. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a1f5459edcec..6013f3685ef4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ACC_USER_MASK; #else - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); + BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); + BUILD_BUG_ON(ACC_EXEC_MASK != 1); + access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); + /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */ + access ^= (gpte >> PT64_NX_SHIFT); #endif return access; -- cgit From 7099e2e1f4d9051f31bbfa5803adf954bb5d76ef Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 4 Mar 2016 15:08:42 +0100 Subject: KVM: VMX: disable PEBS before a guest entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux guests on Haswell (and also SandyBridge and Broadwell, at least) would crash if you decided to run a host command that uses PEBS, like perf record -e 'cpu/mem-stores/pp' -a This happens because KVM is using VMX MSR switching to disable PEBS, but SDM [2015-12] 18.4.4.4 Re-configuring PEBS Facilities explains why it isn't safe: When software needs to reconfigure PEBS facilities, it should allow a quiescent period between stopping the prior event counting and setting up a new PEBS event. The quiescent period is to allow any latent residual PEBS records to complete its capture at their previously specified buffer address (provided by IA32_DS_AREA). There might not be a quiescent period after the MSR switch, so a CPU ends up using host's MSR_IA32_DS_AREA to access an area in guest's memory. (Or MSR switching is just buggy on some models.) The guest can learn something about the host this way: If the guest doesn't map the address pointed to by MSR_IA32_DS_AREA, it results in #PF where we leak host's MSR_IA32_DS_AREA through CR2. After that, a malicious guest can map and configure memory where MSR_IA32_DS_AREA is pointing and can therefore get output from the host's tracing. This is not a critical leak as the host must initiate PEBS tracing and I have not been able to get a record from more than one instruction before vmentry in vmx_vcpu_run() (that place has most registers already overwritten with guest's). We could disable PEBS just a few instructions before vmentry, but disabling it earlier shouldn't affect host tracing too much. We also don't need to switch MSR_IA32_PEBS_ENABLE on VMENTRY, but that optimization isn't worth its code, IMO. (If you are implementing PEBS for guests, be sure to handle the case where both host and guest enable PEBS, because this patch doesn't.) 
Fixes: 26a4f3c08de4 ("perf/x86: disable PEBS on a guest entry.") Cc: Reported-by: Jiří Olša Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ff453749a90..6e51493ff4f9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1813,6 +1813,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, return; } break; + case MSR_IA32_PEBS_ENABLE: + /* PEBS needs a quiescent period after being disabled (to write + * a record). Disabling PEBS through VMX MSR swapping doesn't + * provide that period, so a CPU could write host's record into + * guest's memory. + */ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } for (i = 0; i < m->nr; ++i) -- cgit From 8e2a7f5b9a8c49f1f4e1dc8972198510f43c0b2e Mon Sep 17 00:00:00 2001 From: Kostenzer Felix Date: Sun, 6 Mar 2016 23:20:06 +0100 Subject: x86/nmi: Mark 'ignore_nmis' as __read_mostly ignore_nmis is used in two distinct places: 1. modified through {stop,restart}_nmi by alternative_instructions 2. read by do_nmi to determine if default_do_nmi should be called or not thus the access pattern conforms to __read_mostly and do_nmi() is a fastpath. Signed-off-by: Kostenzer Felix Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8a2cdd736fa4..04b132a767f1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -30,6 +30,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -69,7 +70,7 @@ struct nmi_stats { static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); -static int ignore_nmis; +static int ignore_nmis __read_mostly; int unknown_nmi_panic; /* -- cgit From 58a5aac5331388a175a42b6ed2154f0559cefb21 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 29 Feb 2016 15:50:19 -0800 Subject: x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled x86_64 has very clean espfix handling on paravirt: espfix64 is set up in native_iret, so paravirt systems that override iret bypass espfix64 automatically. This is robust and straightforward. x86_32 is messier. espfix is set up before the IRET paravirt patch point, so it can't be directly conditionalized on whether we use native_iret. We also can't easily move it into native_iret without regressing performance due to a bizarre consideration. Specifically, on 64-bit kernels, the logic is: if (regs->ss & 0x4) setup_espfix; On 32-bit kernels, the logic is: if ((regs->ss & 0x4) && (regs->cs & 0x3) == 3 && (regs->flags & X86_EFLAGS_VM) == 0) setup_espfix; The performance of setup_espfix itself is essentially irrelevant, but the comparison happens on every IRET so its performance matters. On x86_64, there's no need for any registers except flags to implement the comparison, so we fold the whole thing into native_iret. On x86_32, we don't do that because we need a free register to implement the comparison efficiently. We therefore do espfix setup before restoring registers on x86_32. This patch gets rid of the explicit paravirt_enabled check by introducing X86_BUG_ESPFIX on 32-bit systems and using an ALTERNATIVE to skip espfix on paravirt systems where iret != native_iret. This is also messy, but it's at least in line with other things we do. 
This improves espfix performance by removing a branch, but no one cares. More importantly, it removes a paravirt_enabled user, which is good because paravirt_enabled is ill-defined and is going away. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 15 ++------------- arch/x86/include/asm/cpufeatures.h | 8 ++++++++ arch/x86/kernel/cpu/common.c | 25 +++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 66350e6a6ca5..e13027247c90 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -361,6 +361,8 @@ restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 + ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* * Warning: PT_OLDSS(%esp) contains the wrong/random values if we @@ -387,19 +389,6 @@ ENTRY(iret_exc ) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - /* * Setup and switch to ESPFIX stack * diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6663fae71b12..d11a3aaafd96 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,4 +286,12 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#ifdef CONFIG_X86_32 +/* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. + */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#endif + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 079d83fc6488..d8337f34b5f4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -802,6 +802,31 @@ static void detect_nopl(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_NOPL); #else set_cpu_cap(c, X86_FEATURE_NOPL); +#endif + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. 
+ */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +#else + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif #endif } -- cgit From 0dd0036f6e07f741a1356b424b84a3164b6e59cf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 29 Feb 2016 15:50:20 -0800 Subject: x86/asm-offsets: Remove PARAVIRT_enabled It no longer has any users. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 84a7524b202c..5c042466f274 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -59,7 +59,6 @@ void common(void) { #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); -- cgit From 7a8698058ae493ae53b1a8a2fa23d2e37000d73e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 8 Mar 2016 17:40:41 +0100 Subject: perf/x86/intel/rapl: Simplify quirk handling even more Drop the quirk() function pointer in favor of a simple boolean which says whether the quirk should be applied or not. Update comment while at it. Signed-off-by: Borislav Petkov Cc: Alexander Shishkin Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-tip-commits@vger.kernel.org Link: http://lkml.kernel.org/r/20160308164041.GF16568@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 019e541fa988..b834a3f55a01 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -592,18 +592,7 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - -static int rapl_check_hw_unit(void (*quirk)(void)) +static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; int i; @@ -614,9 +603,14 @@ static int rapl_check_hw_unit(void (*quirk)(void)) for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - /* Apply cpu model quirk */ - if (quirk) - quirk(); + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. 
See + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + if (apply_quirk) + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; /* * Calculate the timer rate: @@ -704,7 +698,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { static int __init rapl_pmu_init(void) { - void (*quirk)(void) = NULL; + bool apply_quirk = false; int ret; if (!x86_match_cpu(rapl_cpu_match)) @@ -717,7 +711,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ - quirk = rapl_hsw_server_quirk; + apply_quirk = true; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; @@ -733,7 +727,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; case 87: /* Knights Landing */ - quirk = rapl_hsw_server_quirk; + apply_quirk = true; rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; break; @@ -741,7 +735,7 @@ static int __init rapl_pmu_init(void) return -ENODEV; } - ret = rapl_check_hw_unit(quirk); + ret = rapl_check_hw_unit(apply_quirk); if (ret) return ret; -- cgit From 92b0729c34cab1f46d89aace3e66015f0bb4a682 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 18 Feb 2016 11:47:26 -0800 Subject: x86/mm, x86/mce: Add memcpy_mcsafe() Make use of the EXTABLE_FAULT exception table entries to write a kernel copy routine that doesn't crash the system if it encounters a machine check. Prime use case for this is to copy from large arrays of non-volatile memory used as storage. We have to use an unrolled copy loop for now because current hardware implementations treat a machine check in "rep mov" as fatal. When that is fixed we can simplify. Return type is a "bool". True means that we copied OK, false means that it didn't. 
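A minimal usage sketch, following the calling convention documented in the new string_64.h comment (true means the copy succeeded); the buf, pmem_addr and len names are illustrative assumptions, not part of the patch:

	if (!memcpy_mcsafe(buf, pmem_addr, len))
		return -EIO;	/* a machine check was hit while reading the source */

	/* no poison was consumed; use buf normally */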
Signed-off-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/a44e1055efc2d2a9473307b22c91caa437aa3f8b.1456439214.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/string_64.h | 13 +++++ arch/x86/kernel/x8664_ksyms_64.c | 2 + arch/x86/lib/memcpy_64.S | 117 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ff8b9a17dc4b..ca6ba3607705 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +/** + * memcpy_mcsafe - copy memory with indication if a machine check happened + * + * @dst: destination address + * @src: source address + * @cnt: number of bytes to copy + * + * Low level memory copy function that catches machine checks + * + * Return true for success, false for fail + */ +bool memcpy_mcsafe(void *dst, const void *src, size_t cnt); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..cd05942bc918 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(memcpy_mcsafe); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bba87de..7d37641ada5b 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -177,3 +177,120 @@ ENTRY(memcpy_orig) .Lend: retq ENDPROC(memcpy_orig) + +#ifndef CONFIG_UML +/* + * memcpy_mcsafe - memory copy with machine check exception handling + * Note that we only catch machine checks when reading the source addresses. + * Writes to target are posted and don't generate machine checks. + */ +ENTRY(memcpy_mcsafe) + cmpl $8, %edx + /* Less than 8 bytes? Go to byte copy loop */ + jb .L_no_whole_words + + /* Check for bad alignment of source */ + testl $7, %esi + /* Already aligned */ + jz .L_8byte_aligned + + /* Copy one byte at a time until source is 8-byte aligned */ + movl %esi, %ecx + andl $7, %ecx + subl $8, %ecx + negl %ecx + subl %ecx, %edx +.L_copy_leading_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_leading_bytes + +.L_8byte_aligned: + /* Figure out how many whole cache lines (64-bytes) to copy */ + movl %edx, %ecx + andl $63, %edx + shrl $6, %ecx + jz .L_no_whole_cache_lines + + /* Loop copying whole cache lines */ +.L_cache_w0: movq (%rsi), %r8 +.L_cache_w1: movq 1*8(%rsi), %r9 +.L_cache_w2: movq 2*8(%rsi), %r10 +.L_cache_w3: movq 3*8(%rsi), %r11 + movq %r8, (%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) +.L_cache_w4: movq 4*8(%rsi), %r8 +.L_cache_w5: movq 5*8(%rsi), %r9 +.L_cache_w6: movq 6*8(%rsi), %r10 +.L_cache_w7: movq 7*8(%rsi), %r11 + movq %r8, 4*8(%rdi) + movq %r9, 5*8(%rdi) + movq %r10, 6*8(%rdi) + movq %r11, 7*8(%rdi) + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + decl %ecx + jnz .L_cache_w0 + + /* Are there any trailing 8-byte words? 
*/ +.L_no_whole_cache_lines: + movl %edx, %ecx + andl $7, %edx + shrl $3, %ecx + jz .L_no_whole_words + + /* Copy trailing words */ +.L_copy_trailing_words: + movq (%rsi), %r8 + mov %r8, (%rdi) + leaq 8(%rsi), %rsi + leaq 8(%rdi), %rdi + decl %ecx + jnz .L_copy_trailing_words + + /* Any trailing bytes? */ +.L_no_whole_words: + andl %edx, %edx + jz .L_done_memcpy_trap + + /* Copy trailing bytes */ + movl %edx, %ecx +.L_copy_trailing_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_trailing_bytes + + /* Copy successful. Return true */ +.L_done_memcpy_trap: + xorq %rax, %rax + ret +ENDPROC(memcpy_mcsafe) + + .section .fixup, "ax" + /* Return false for any failure */ +.L_memcpy_mcsafe_fail: + mov $1, %rax + ret + + .previous + + _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail) +#endif -- cgit From b894157145e4ac7598d7062bc93320898a5e059e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 26 Feb 2016 09:15:11 -0600 Subject: x86/PCI: Mark Broadwell-EP Home Agent & PCU as having non-compliant BARs The Home Agent and PCU PCI devices in Broadwell-EP have a non-BAR register where a BAR should be. We don't know what the side effects of sizing the "BAR" would be, and we don't know what address space the "BAR" might appear to describe. Mark these devices as having non-compliant BARs so the PCI core doesn't touch them. Signed-off-by: Bjorn Helgaas Tested-by: Andi Kleen CC: stable@vger.kernel.org --- arch/x86/pci/fixup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e58565556703..0ae7e9fa348d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -540,3 +540,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); + +static void pci_bdwep_bar(struct pci_dev *dev) +{ + dev->non_compliant_bars = 1; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); -- cgit From 63e22924f5dc08badf9870c4a52ef4e7f7cbf218 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 1 Mar 2016 11:32:29 -0600 Subject: PCI: Mark shadow copy of VGA ROM as IORESOURCE_PCI_FIXED A shadow copy of an option ROM is placed by the BIOS at a fixed address. Set IORESOURCE_PCI_FIXED to indicate that we can't move the shadow copy. This prevents warnings like the following when we assign resources: BAR 6: [??? 0x00000000 flags 0x2] has bogus alignment This warning is emitted by pdev_sort_resources(), which already ignores IORESOURCE_PCI_FIXED resources. 
Link: http://lkml.kernel.org/r/CA+55aFyVMfTBB0oz_yx8+eQOEJnzGtCsYSj9QuhEpdZ9BHdq5A@mail.gmail.com Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 0ae7e9fa348d..dac027c1adb1 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -336,7 +336,8 @@ static void pci_fixup_video(struct pci_dev *pdev) if (!vga_default_device() || pdev == vga_default_device()) { pci_read_config_word(pdev, PCI_COMMAND, &config); if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { - pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; + pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW | + IORESOURCE_PCI_FIXED; dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n"); } } -- cgit From 0c0e0736acad4e76e718456c75d78ad95eea0011 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 1 Mar 2016 11:38:46 -0600 Subject: PCI: Set ROM shadow location in arch code, not in PCI core IORESOURCE_ROM_SHADOW means there is a copy of a device's option ROM in RAM. The existence of such a copy and its location are arch-specific. Previously the IORESOURCE_ROM_SHADOW flag was set in arch code, but the 0xC0000-0xDFFFF location was hard-coded into the PCI core. If we're using a shadow copy in RAM, disable the ROM BAR and release the address space it was consuming. Move the location information from the PCI core to the arch code that sets IORESOURCE_ROM_SHADOW. Save the location of the RAM copy in the struct resource for PCI_ROM_RESOURCE. After this change, pci_map_rom() will call pci_assign_resource() and pci_enable_rom() for these IORESOURCE_ROM_SHADOW resources, which we did not do before. This is safe because: - pci_assign_resource() will do nothing because the resource is marked IORESOURCE_PCI_FIXED, which means we can't move it, and - pci_enable_rom() will not turn on the ROM BAR's enable bit because the resource is marked IORESOURCE_ROM_SHADOW, which means it is in RAM rather than in PCI memory space. Storing the location in the struct resource means "lspci" will show the shadow location, not the value from the ROM BAR. Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index dac027c1adb1..b7de1929714b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -297,14 +297,14 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r * * The standard boot ROM sequence for an x86 machine uses the BIOS * to select an initial video card for boot display. This boot video - * card will have it's BIOS copied to C0000 in system RAM. + * card will have its BIOS copied to 0xC0000 in system RAM. * IORESOURCE_ROM_SHADOW is used to associate the boot video * card with this copy. On laptops this copy has to be used since * the main ROM may be compressed or combined with another image. * See pci_map_rom() for use of this flag. Before marking the device * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set - * by either arch cde or vga-arbitration, if so only apply the fixup to this - * already determined primary video card. + * by either arch code or vga-arbitration; if so only apply the fixup to this + * already-determined primary video card. 
*/ static void pci_fixup_video(struct pci_dev *pdev) @@ -312,6 +312,7 @@ static void pci_fixup_video(struct pci_dev *pdev) struct pci_dev *bridge; struct pci_bus *bus; u16 config; + struct resource *res; /* Is VGA routed to us? */ bus = pdev->bus; @@ -336,9 +337,18 @@ static void pci_fixup_video(struct pci_dev *pdev) if (!vga_default_device() || pdev == vga_default_device()) { pci_read_config_word(pdev, PCI_COMMAND, &config); if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { - pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW | - IORESOURCE_PCI_FIXED; - dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n"); + res = &pdev->resource[PCI_ROM_RESOURCE]; + + pci_disable_rom(pdev); + if (res->parent) + release_resource(res); + + res->start = 0xC0000; + res->end = res->start + 0x20000 - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW | + IORESOURCE_PCI_FIXED; + dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n", + res); } } } -- cgit From 5f8fc43217a01c98be88aa53794b4d1333a07a5a Mon Sep 17 00:00:00 2001 From: Bogicevic Sasa Date: Wed, 3 Feb 2016 13:24:22 -0800 Subject: PCI: Include pci/pcie/Kconfig directly from pci/Kconfig Include pci/pcie/Kconfig directly from pci/Kconfig, so arches don't have to source both pci/Kconfig and pci/pcie/Kconfig. Note that this effectively adds pci/pcie/Kconfig to the following arches, because they already sourced drivers/pci/Kconfig but they previously did not source drivers/pci/pcie/Kconfig: alpha avr32 blackfin frv m32r m68k microblaze mn10300 parisc sparc unicore32 xtensa [bhelgaas: changelog, source pci/pcie/Kconfig at top of pci/Kconfig, whitespace] Signed-off-by: Sasa Bogicevic Signed-off-by: Bjorn Helgaas --- arch/x86/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 330e738ccfc1..ff7fe2dc881c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2431,8 +2431,6 @@ config PCI_CNB20LE_QUIRK You should say N unless you know you need this. -source "drivers/pci/pcie/Kconfig" - source "drivers/pci/Kconfig" # x86_64 have no ISA slots, but can have ISA-style DMA. -- cgit From e7e127e3c767094aca96ee976729dc5f756666df Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 8 Mar 2016 14:57:21 -0600 Subject: PCI: Include pci/hotplug Kconfig directly from pci/Kconfig Include pci/hotplug/Kconfig directly from pci/Kconfig, so arches don't have to source both pci/Kconfig and pci/hotplug/Kconfig. Note that this effectively adds pci/hotplug/Kconfig to the following arches, because they already sourced drivers/pci/Kconfig but they previously did not source drivers/pci/hotplug/Kconfig: alpha arm avr32 frv m68k microblaze mn10300 sparc unicore32 Inspired-by-patch-from: Bogicevic Sasa Signed-off-by: Bjorn Helgaas --- arch/x86/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ff7fe2dc881c..1912637d6178 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2586,8 +2586,6 @@ config AMD_NB source "drivers/pcmcia/Kconfig" -source "drivers/pci/hotplug/Kconfig" - config RAPIDIO tristate "RapidIO support" depends on PCI -- cgit From f363938c70a04e6bc99023a5e0c44ef7879b903f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Jan 2016 15:24:31 -0800 Subject: x86/fpu: Fix 'no387' regression After fixing FPU option parsing, we now parse the 'no387' boot option too early: no387 clears X86_FEATURE_FPU before it's even probed, so the boot CPU promptly re-enables it. 
I suspect it gets even more confused on SMP. Fix the probing code to leave X86_FEATURE_FPU off if it's been disabled by setup_clear_cpu_cap(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Fixes: 4f81cbafcce2 ("x86/fpu: Fix early FPU command-line parsing") Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7ef4c8..d53ab3d3b8e8 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + } #ifndef CONFIG_MATH_EMULATION if (!cpu_has_fpu) { -- cgit From a87036add09283e6c4f4103a15c596c67b86ab86 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Mar 2016 09:52:13 +0100 Subject: KVM: x86: disable MPX if host did not enable MPX XSAVE features When eager FPU is disabled, KVM will still see the MPX bit in CPUID and presumably the MPX vmentry and vmexit controls. However, it will not be able to expose the MPX XSAVE features to the guest, because the guest's accessible XSAVE features are always a subset of host_xcr0. In this case, we should disable the MPX CPUID bit, the BNDCFGS MSR, and the MPX vmentry and vmexit controls for nested virtualization. It is then unnecessary to enable guest eager FPU if the guest has the MPX CPUID bit set. 
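The gating described above reduces to the helper the patch introduces, reproduced here outside the diff for readability; the CPUID bit, the BNDCFGS MSR and the nested VMX entry/exit controls are then all keyed off this single predicate:

	bool kvm_mpx_supported(void)
	{
		return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
			&& kvm_x86_ops->mpx_supported());
	}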
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 13 ++++++++++--- arch/x86/kvm/cpuid.h | 9 +-------- arch/x86/kvm/vmx.c | 13 ++++++------- 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6525e926f566..fa241d4fda98 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -46,11 +46,18 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +bool kvm_mpx_supported(void) +{ + return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + && kvm_x86_ops->mpx_supported()); +} +EXPORT_SYMBOL_GPL(kvm_mpx_supported); + u64 kvm_supported_xcr0(void) { u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; - if (!kvm_x86_ops->mpx_supported()) + if (!kvm_mpx_supported()) xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); return xcr0; @@ -97,7 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); + vcpu->arch.eager_fpu = use_eager_fpu(); if (vcpu->arch.eager_fpu) kvm_x86_ops->fpu_activate(vcpu); @@ -295,7 +302,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; + unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c8eda1498121..66a6581724ad 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include int kvm_update_cpuid(struct kvm_vcpu *vcpu); +bool kvm_mpx_supported(void); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, @@ -135,14 +136,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } -static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_MPX)); -} - static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 46154dac71e6..e512aa7ed874 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -861,7 +861,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -2595,7 +2594,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. 
*/ @@ -2616,7 +2615,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VM_ENTRY_LOAD_IA32_PAT; vmx->nested.nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ @@ -2860,7 +2859,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -2937,7 +2936,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3410,7 +3409,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_write_fields; i++) { switch (shadow_read_write_fields[i]) { case GUEST_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) continue; break; default: @@ -10265,7 +10264,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); -- cgit From 5a5fbdc0e3f1159a734f1890da60fce70e98271d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Mar 2016 10:00:11 +0100 Subject: KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch It is now equal to use_eager_fpu(), which simply tests a cpufeature bit. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 3 +-- arch/x86/kvm/x86.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d110dc44d6c2..01c8b501cb6d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -503,7 +503,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; - bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index fa241d4fda98..0029644bf09c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu(); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60d6c0036a98..bcbce0fa0bc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7329,7 +7329,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. 
*/ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } -- cgit From 8b30a8b3c636a155bab9176ad209964c9c22252d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Mar 2016 14:48:21 +0100 Subject: x86/defconfigs/32: Set CONFIG_FRAME_WARN to the Kconfig default Sync it to the Kconfig default for 32-bit. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: tim.gardner@canonical.com Link: http://lkml.kernel.org/r/20160309134821.GD6564@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48c8839..e25a1630320c 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_PRINTK_TIME=y # CONFIG_ENABLE_WARN_DEPRECATED is not set -CONFIG_FRAME_WARN=2048 +CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y -- cgit From 59e6473980f321c16299e12db69d1fabc2644a6f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 8 Mar 2016 07:16:07 -0800 Subject: libnvdimm, pmem: clear poison on write If a write is directed at a known bad block perform the following: 1/ write the data 2/ send a clear poison command 3/ invalidate the poison out of the cache hierarchy Cc: Cc: Ross Zwisler Reviewed-by: Vishal Verma Signed-off-by: Dan Williams --- arch/x86/include/asm/pmem.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index c57fd1ea9689..bf8b35d2035a 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -137,6 +137,11 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) arch_wb_cache_pmem(addr, size); } +static inline void arch_invalidate_pmem(void __pmem *addr, size_t size) +{ + clflush_cache_range((void __force *) addr, size); +} + static inline bool __arch_has_wmb_pmem(void) { /* -- cgit From e786041153df6343169373177248abfab5c5ac1b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:25 -0800 Subject: x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test CLAC is slow, and the SYSENTER code already has an unlikely path that runs if unusual flags are set. Drop the CLAC and instead rely on the unlikely path to clear AC. This seems to save ~24 cycles on my Skylake laptop. (Hey, Intel, make this faster please!) Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/90d6db2189f9add83bc7bddd75a0c19ebbd676b2.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3c990eeee40b..da5c19443e0b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -66,8 +66,6 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ - ASM_CLAC /* Clear AC after saving FLAGS */ - pushq $__USER32_CS /* pt_regs->cs */ xorq %r8,%r8 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ @@ -90,9 +88,9 @@ ENTRY(entry_SYSENTER_compat) cld /* - * Sysenter doesn't filter flags, so we need to clear NT + * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. + * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * @@ -102,7 +100,7 @@ ENTRY(entry_SYSENTER_compat) * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: -- cgit From 67f590e8d4d718d9bd377b39223f7f69678d6a10 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:26 -0800 Subject: x86/entry/32: Filter NT and speed up AC filtering in SYSENTER This makes the 32-bit code work just like the 64-bit code. It should speed up syscalls on 32-bit kernels on Skylake by something like 20 cycles (by analogy to the 64-bit compat case). It also cleans up NT just like we do for the 64-bit case. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/07daef3d44bd1ed62a2c866e143e8df64edb40ee.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e13027247c90..8daa8127f578 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,13 +294,29 @@ sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ - ASM_CLAC /* Clear AC after saving FLAGS */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. 
+ * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. @@ -339,6 +355,11 @@ sysenter_past_esp: .popsection _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX + +.Lsysenter_fix_flags: + pushl $X86_EFLAGS_FIXED + popfl + jmp .Lsysenter_flags_fixed ENDPROC(entry_SYSENTER_32) # system call handler stub -- cgit From c2c9b52fab0d0cf993476ed4c34f24da5a1205ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:27 -0800 Subject: x86/entry/32: Restore FLAGS on SYSEXIT We weren't restoring FLAGS at all on SYSEXIT. Apparently no one cared. With this patch applied, native kernels should always honor task_pt_regs()->flags, which opens the door for some sys_iopl() cleanups. I'll do those as a separate series, though, since getting it right will involve tweaking some paravirt ops. ( The short version is that, before this patch, sys_iopl(), invoked via SYSENTER, wasn't guaranteed to ever transfer the updated regs->flags, so sys_iopl() had to change the hardware flags register as well. ) Reported-by: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f98b207472dc9784838eb5ca2b89dcc845ce269.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 8daa8127f578..76109068149f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -342,6 +342,15 @@ sysenter_past_esp: popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ + /* + * Restore all flags except IF. (We restore IF separately because + * STI gives a one-instruction window in which we won't be interrupted, + * whereas POPF does not.) + */ + addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ + btr $X86_EFLAGS_IF_BIT, (%esp) + popfl + /* * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). -- cgit From 81edd9f69a6fd214fdbe66b43de6aa1610c84c63 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:28 -0800 Subject: x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions The SDM says that debug exceptions clear BTF, and we need to keep TIF_BLOCKSTEP in sync with BTF. Clear it unconditionally and improve the comment. I suspect that the fact that kmemcheck could cause TIF_BLOCKSTEP not to be cleared was just an oversight. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fa86e55d196e6dde5b38839595bde2a292c52fdc.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 410e8e2700c5..1af56217fb5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -597,6 +597,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. @@ -612,11 +619,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; -- cgit From 8bb5643686d2898bec181c407651cf84e77d413f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:29 -0800 Subject: x86/entry/traps: Clear DR6 early in do_debug() and improve the comment Leaving any bits set in DR6 on return from a debug exception is asking for trouble. Prevent it by writing zero right away and clarify the comment. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3857676e1be8fb27db4b89bbb1e2052b7f435ff4.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1af56217fb5d..7394be8b1a66 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -593,6 +593,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ist_enter(regs); get_debugreg(dr6, 6); + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; @@ -616,9 +628,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; - /* DR6 may or may not be cleared by the CPU */ - set_debugreg(0, 6); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; -- cgit From f2b375756c839ff655a3e0b45e339f8fbf973093 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:30 -0800 Subject: x86/entry: Vastly simplify SYSENTER TF (single-step) handling Due to a blatant design error, SYSENTER doesn't clear TF (single-step). 
As a result, if a user does SYSENTER with TF set, we will single-step through the kernel until something clears TF. There is absolutely nothing we can do to prevent this short of turning off SYSENTER [1]. Simplify the handling considerably with two changes: 1. We already sanitize EFLAGS in SYSENTER to clear NT and AC. We can add TF to that list of flags to sanitize with no overhead whatsoever. 2. Teach do_debug() to ignore single-step traps in the SYSENTER prologue. That's all we need to do. Don't get too excited -- our handling is still buggy on 32-bit kernels. There's nothing wrong with the SYSENTER code itself, but the #DB prologue has a clever fixup for traps on the very first instruction of entry_SYSENTER_32, and the fixup doesn't work quite correctly. The next two patches will fix that. [1] We could probably prevent it by forcing BTF on at all times and making sure we clear TF before any branches in the SYSENTER code. Needless to say, this is a bad idea. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a30d2ea06fe4b621fe6a9ef911b02c0f38feb6f2.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 42 ++++++++++++++++++++++---------- arch/x86/entry/entry_64_compat.S | 9 ++++++- arch/x86/include/asm/proto.h | 15 ++++++++++-- arch/x86/kernel/traps.c | 52 +++++++++++++++++++++++++++++++++------- 4 files changed, 94 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 76109068149f..b2e1d446bdf8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -287,7 +287,26 @@ need_resched: END(resume_kernel) #endif - # SYSENTER call handler stub +GLOBAL(__begin_SYSENTER_singlestep_region) +/* + * All code from here through __end_SYSENTER_singlestep_region is subject + * to being single-stepped if a user program sets TF and executes SYSENTER. + * There is absolutely nothing that we can do to prevent this from happening + * (thanks Intel!). To keep our handling of this situation as simple as + * possible, we handle TF just like AC and NT, except that our #DB handler + * will ignore all of the single-step traps generated in this range. + */ + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp +#endif + ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -301,19 +320,25 @@ sysenter_past_esp: SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * SYSENTER doesn't filter flags, so we need to clear NT and AC - * ourselves. To save a few cycles, we can check whether + * SYSENTER doesn't filter flags, so we need to clear NT, AC + * and TF ourselves. To save a few cycles, we can check whether * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. 
This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -369,6 +394,7 @@ sysenter_past_esp: pushl $X86_EFLAGS_FIXED popfl jmp .Lsysenter_flags_fixed +GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) # system call handler stub @@ -651,14 +677,6 @@ ENTRY(spurious_interrupt_bug) END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* - * Xen doesn't set %esp to be precisely what the normal SYSENTER - * entry point expects, so fix it up before using the normal path. - */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index da5c19443e0b..cbb5e6943d9b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -94,13 +94,19 @@ ENTRY(entry_SYSENTER_compat) * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. 
*/ - testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -121,6 +127,7 @@ ENTRY(entry_SYSENTER_compat) pushq $X86_EFLAGS_FIXED popfq jmp .Lsysenter_flags_fixed +GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a4a77286cb1d..9b9b30b19441 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -7,12 +7,23 @@ void syscall_init(void); +#ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); -void entry_SYSCALL_compat(void); +#endif + +#ifdef CONFIG_X86_32 void entry_INT80_32(void); -void entry_INT80_compat(void); void entry_SYSENTER_32(void); +void __begin_SYSENTER_singlestep_region(void); +void __end_SYSENTER_singlestep_region(void); +#endif + +#ifdef CONFIG_IA32_EMULATION void entry_SYSENTER_compat(void); +void __end_entry_SYSENTER_compat(void); +void entry_SYSCALL_compat(void); +void entry_INT80_compat(void); +#endif void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7394be8b1a66..b0ddb81926f9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -559,6 +559,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) NOKPROBE_SYMBOL(fixup_bad_iret); #endif +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. (We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -616,6 +639,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) */ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && + is_sysenter_singlestep(regs))) { + dr6 &= ~DR_STEP; + if (!dr6) + goto exit; + /* + * else we might have gotten a single-step trap and hit a + * watchpoint at the same time, in which case we should fall + * through and handle the watchpoint. + */ + } + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. @@ -624,7 +659,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions first of all! */ + /* Catch kmemcheck conditions! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; @@ -659,14 +694,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) goto exit; } - /* - * Single-stepping through system calls: ignore any exceptions in - * kernel space, but re-enable TF when returning to user mode. 
- * - * We already checked v86 mode above, so we can check for kernel mode - * by just checking the CPL of CS. - */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { + /* + * Historical junk that used to handle SYSENTER single-stepping. + * This should be unreachable now. If we survive for a while + * without anyone hitting this warning, we'll turn this into + * an oops. + */ tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; -- cgit From 6dcc94149d605908a7c0c4cf2085340637aac86d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:31 -0800 Subject: x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed The SYSENTER stack is only used on 32-bit kernels. Remove it on 64-bit kernels. ( We may end up using it down the road on 64-bit kernels. If so, we'll re-enable it for CONFIG_IA32_EMULATION. ) Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9dbd18429f9ff61a76b6eda97a9ea20510b9f6ba.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ecb410310e70..7cd01b71b5bd 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -297,10 +297,12 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +#ifdef CONFIG_X86_32 /* * Space for the temporary SYSENTER stack: */ unsigned long SYSENTER_stack[64]; +#endif } ____cacheline_aligned; -- cgit From 7536656f08d0c1a3b4c487d00785c5186ec6f533 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:32 -0800 Subject: x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup Right after SYSENTER, we can get a #DB or NMI. On x86_32, there's no IST, so the exception handler is invoked on the temporary SYSENTER stack. Because the SYSENTER stack is very small, we have a fixup to switch off the stack quickly when this happens. The old fixup had several issues: 1. It checked the interrupt frame's CS and EIP. This wasn't obviously correct on Xen or if vm86 mode was in use [1]. 2. In the NMI handler, it did some frightening digging into the stack frame. I'm not convinced this digging was correct. 3. The fixup didn't switch stacks and then switch back. Instead, it synthesized a brand new stack frame that would redirect the IRET back to the SYSENTER code. That frame was highly questionable. For one thing, if NMI nested inside #DB, we would effectively abort the #DB prologue, which was probably safe but was frightening. For another, the code used PUSHFL to write the FLAGS portion of the frame, which was simply bogus -- by the time PUSHFL was called, at least TF, NT, VM, and all of the arithmetic flags were clobbered. Simplify this considerably. Instead of looking at the saved frame to see where we came from, check the hardware ESP register against the SYSENTER stack directly. Malicious user code cannot spoof the kernel ESP register, and by moving the check after SAVE_ALL, we can use normal PER_CPU accesses to find all the relevant addresses. With this patch applied, the improved syscall_nt_32 test finally passes on 32-bit kernels. 
[1] It isn't obviously correct, but it is nonetheless safe from vm86 shenanigans as far as I can tell. A user can't point EIP at entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32, like all kernel addresses, is greater than 0xffff and would thus violate the CS segment limit. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b2cdbc037031c07ecf2c40a96069318aec0e7971.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 114 ++++++++++++++++++--------------------- arch/x86/kernel/asm-offsets_32.c | 5 ++ 2 files changed, 56 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2e1d446bdf8..7b3ec24ede82 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -976,51 +976,48 @@ error_code: jmp ret_from_exception END(page_fault) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - ENTRY(debug) + /* + * #DB can happen at the first instruction of + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this + * happens, then we will be running on a very small stack. We + * need to detect this condition and switch to the thread + * stack before calling any C code at all. + * + * If you edit this code, keep in mind that NMIs can happen in here. + */ ASM_CLAC - cmpl $entry_SYSENTER_32, (%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: pushl $-1 # mark this as an int SAVE_ALL - TRACE_IRQS_OFF xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF + call do_debug + jmp ret_from_exception + +.Ldebug_from_sysenter_stack: + /* We're on the SYSENTER stack. Switch off. */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + TRACE_IRQS_OFF call do_debug + movl %ebp, %esp jmp ret_from_exception END(debug) /* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. + * NMI is doubly nasty. 
It can happen on the first instruction of + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 + * switched stacks. We handle both conditions by simply checking whether we + * interrupted kernel code running on the SYSENTER stack. */ ENTRY(nmi) ASM_CLAC @@ -1031,41 +1028,32 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32, (%esp) - je nmi_stack_fixup - pushl %eax - movl %esp, %eax - /* - * Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1), %eax - cmpl $(THREAD_SIZE-20), %eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32, 12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax + + pushl %eax # pt_regs->orig_ax SAVE_ALL xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. */ call do_nmi jmp restore_all_notrace -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS, 16(%esp) - jne nmi_stack_correct - cmpl $debug, (%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn, (%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct +.Lnmi_from_sysenter_stack: + /* + * We're on the SYSENTER stack. Switch off. No one (not even debug) + * is using the thread stack right now, so it's safe for us to use it. + */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + call do_nmi + movl %ebp, %esp + jmp restore_all_notrace #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fdeb0ce07c16..ecdc1d217dc0 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -52,6 +52,11 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); -- cgit From 2a41aa4feb25af3ead60b740c43df80c576efea2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:33 -0800 Subject: x86/entry/32: Add and check a stack canary for the SYSENTER stack The first instruction of the SYSENTER entry runs on its own tiny stack. That stack can be used if a #DB or NMI is delivered before the SYSENTER prologue switches to a real stack. We have code in place to prevent us from overflowing the tiny stack. For added paranoia, add a canary to the stack and check it in do_debug() -- that way, if something goes wrong with the #DB logic, we'll eventually notice. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6ff9a806f39098b166dc2c41c1db744df5272f29.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/process.c | 3 +++ arch/x86/kernel/traps.c | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7cd01b71b5bd..50a6dc871cc0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -299,8 +299,9 @@ struct tss_struct { #ifdef CONFIG_X86_32 /* - * Space for the temporary SYSENTER stack: + * Space for the temporary SYSENTER stack. */ + unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..ee9a9792caeb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif +#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b0ddb81926f9..49e2e775f507 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -713,6 +713,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: +#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. + */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); -- cgit From 392a62549fbb80da48811d04391615a25c39d091 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:34 -0800 Subject: x86/entry: Remove TIF_SINGLESTEP entry work Now that SYSENTER with TF set puts X86_EFLAGS_TF directly into regs->flags, we don't need a TIF_SINGLESTEP fixup in the syscall entry code. Remove it. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d15f24da52dafc9d2f0b8d76f55544f4779c517.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 10 ---------- arch/x86/include/asm/thread_info.h | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1a000f59f04a..209ba334e90f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -172,16 +172,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. 
- */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - #ifdef CONFIG_SECCOMP /* * Call seccomp_phase2 before running the other hooks so that diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c0778fcab06d..f2e2302c406f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -137,7 +137,7 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) /* work to do on any return to user space */ -- cgit From fda57b2267e12de014069b1596a5438cf76fc7c6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:35 -0800 Subject: x86/entry: Improve system call entry comments Ingo suggested that the comments should explain when the various entries are used. This adds these explanations and improves other parts of the comments. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9524ecef7a295347294300045d08354d6a57c6e7.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 61 +++++++++++++++++++++++++++- arch/x86/entry/entry_64.S | 10 +++++ arch/x86/entry/entry_64_compat.S | 85 +++++++++++++++++++++++++++------------- 3 files changed, 128 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7b3ec24ede82..286efa342091 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,6 +307,38 @@ ENTRY(xen_sysenter_target) jmp sysenter_past_esp #endif +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * if X86_FEATURE_SEP is available. This is the preferred system call + * entry on 32-bit systems. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old EIP (!!!), ESP, or EFLAGS. + * + * To avoid losing track of EFLAGS.VM (and thus potentially corrupting + * user and/or vm86 state), we explicitly disable the SYSENTER + * instruction in vm86 mode by reprogramming the MSRs. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -397,7 +429,34 @@ sysenter_past_esp: GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) - # system call handler stub +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by any 32-bit perform system calls. 
+ * Instances of INT $0x80 can be found inline in various programs and + * libraries. It is also used by the vDSO's __kernel_vsyscall + * fallback for hardware that doesn't support a faster entry method. + * Restarted 32-bit system calls also fall back to INT $0x80 + * regardless of what instruction was originally used to do the system + * call. (64-bit programs can use INT $0x80 as well, but they can + * only run on 64-bit kernels and therefore land in + * entry_INT80_compat.) + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70eadb0ea5fa..858b555e274b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64) /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * + * This is the only entry point used for 64-bit system calls. The + * hardware interface is reasonably well designed and the register to + * argument mapping Linux uses fits well with the registers that are + * available when SYSCALL is used. + * + * SYSCALL instructions can be found inlined in libc implementations as + * well as some other programs and libraries. There are also a handful + * of SYSCALL instructions in the vDSO used, for example, as a + * clock_gettimeofday fallback. + * * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index cbb5e6943d9b..efe0fe3f76bc 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -19,12 +19,21 @@ .section .entry.text, "ax" /* - * 32-bit SYSENTER instruction entry. + * 32-bit SYSENTER entry. * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on Intel CPUs. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. + * and does not save old RIP (!!!), RSP, or RFLAGS. * * Arguments: * eax system call number @@ -35,10 +44,6 @@ * edi arg5 * ebp user stack * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ @@ -131,17 +136,38 @@ GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* - * 32-bit SYSCALL instruction entry. + * 32-bit SYSCALL entry. 
+ * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on AMD CPUs. + * + * The SYSCALL instruction, in principle, should *only* occur in the + * vDSO. In practice, it appears that this really is the case. + * As evidence: + * + * - The calling convention for SYSCALL has changed several times without + * anyone noticing. + * + * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything + * user task that did SYSCALL without immediately reloading SS + * would randomly crash. * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. + * - Most programmers do not directly target AMD CPUs, and the 32-bit + * SYSCALL instruction does not exist on Intel CPUs. Even on AMD + * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels + * because the SYSCALL instruction in legacy/native 32-bit mode (as + * opposed to compat mode) is sufficiently poorly designed as to be + * essentially unusable. * - * Note: rflags saving+masking-with-MSR happens only in Long mode + * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves + * RFLAGS to R11, then loads new SS, CS, and RIP from previously + * programmed MSRs. RFLAGS gets masked by a value from another MSR + * (so CLD and CLAC are not needed). SYSCALL does not save anything on + * the stack and does not change RSP. + * + * Note: RFLAGS saving+masking-with-MSR happens only in Long mode * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). * @@ -241,7 +267,21 @@ sysret32_from_system_call: END(entry_SYSCALL_compat) /* - * Emulated IA32 system calls via int 0x80. + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. * * Arguments: * eax system call number @@ -250,17 +290,8 @@ END(entry_SYSCALL_compat) * edx arg3 * esi arg4 * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. 
+ * ebp arg6 */ - ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. -- cgit From a65050c6f17e52442716138d48d0a47301a8344b Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Wed, 9 Mar 2016 16:28:54 -0800 Subject: x86/fpu: Revert ("x86/fpu: Disable AVX when eagerfpu is off") Leonid Shatz noticed that the SDM interpretation of the following recent commit: 394db20ca240741 ("x86/fpu: Disable AVX when eagerfpu is off") ... is incorrect and that the original behavior of the FPU code was correct. Because AVX is not stated in CR0 TS bit description, it was mistakenly believed to be not supported for lazy context switch. This turns out to be false: Intel Software Developer's Manual Vol. 3A, Sec. 2.5 Control Registers: 'TS Task Switched bit (bit 3 of CR0) -- Allows the saving of the x87 FPU/ MMX/SSE/SSE2/SSE3/SSSE3/SSE4 context on a task switch to be delayed until an x87 FPU/MMX/SSE/SSE2/SSE3/SSSE3/SSE4 instruction is actually executed by the new task.' Intel Software Developer's Manual Vol. 2A, Sec. 2.4 Instruction Exception Specification: 'AVX instructions refer to exceptions by classes that include #NM "Device Not Available" exception for lazy context switch.' So revert the commit. Reported-by: Leonid Shatz Signed-off-by: Yu-cheng Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457569734-3785-1-git-send-email-yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 9 ++++----- arch/x86/kernel/fpu/init.c | 6 ------ 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index af30fdeb140d..f23cd8c80b1c 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -20,16 +20,15 @@ /* Supported features which support lazy state saving */ #define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE) - -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM) +/* Supported features which require eager state saving */ +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) + /* All currently supported features */ #define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d53ab3d3b8e8..9ee7e307b18f 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -302,12 +302,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void) static void __init fpu__clear_eager_fpu_features(void) { setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); } /* -- cgit From a798f091113ef4999277dbe0483d37d04fa35b2e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 13:24:32 -0800 Subject: x86/entry/32: Change INT80 to be an interrupt gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want all of the syscall entries to run with interrupts off so that we can efficiently run 
context tracking before enabling interrupts. This will regress int $0x80 performance on 32-bit kernels by a couple of cycles. This shouldn't matter much -- int $0x80 is not a fast path. This effectively reverts: 657c1eea0019 ("x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on") ... and fixes the same issue differently. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/59b4f90c9ebfccd8c937305dbbbca680bc74b905.1457558566.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 15 +++------------ arch/x86/entry/entry_32.S | 8 ++++---- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/kernel/traps.c | 2 +- 4 files changed, 9 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 209ba334e90f..d69d1b6e6c31 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -371,14 +371,7 @@ __visible void do_syscall_64(struct pt_regs *regs) * in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -#ifdef CONFIG_X86_32 -/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ -__visible -#else -/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ -static -#endif -__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -413,14 +406,12 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -#ifdef CONFIG_X86_64 -/* Handles INT80 on 64-bit kernels */ -__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) { local_irq_enable(); do_syscall_32_irqs_on(regs); } -#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 286efa342091..10868aa734dc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -463,13 +463,13 @@ ENTRY(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on. Unlike the 64-bit - * case, INT80 is a trap gate on 32-bit kernels, so interrupts - * are already on (unless user code is messing around with iopl). + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. */ + TRACE_IRQS_OFF movl %esp, %eax - call do_syscall_32_irqs_on + call do_int80_syscall_32 .Lsyscall_32_done: restore_all: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index efe0fe3f76bc..847f2f0c31e5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -336,7 +336,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_syscall_32_irqs_off + call do_int80_syscall_32 .Lsyscall_32_done: /* Go back to user mode. 
*/ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 49e2e775f507..5fae2d840e64 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -912,7 +912,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif -- cgit From 9999c8c01f34c918a57d6e5ba2f5d8b79aa04801 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 13:24:33 -0800 Subject: x86/entry: Call enter_from_user_mode() with IRQs off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that slow-path syscalls always enter C before enabling interrupts, it's straightforward to call enter_from_user_mode() before enabling interrupts rather than doing it as part of entry tracing. With this change, we should finally be able to retire exception_enter(). This will also enable optimizations based on knowing that we never change context tracking state with interrupts on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bc376ecf87921a495e874ff98139b1ca2f5c5dd7.1457558566.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 33 +++++++++++---------------------- arch/x86/include/asm/thread_info.h | 5 ++++- 2 files changed, 15 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d69d1b6e6c31..e79d93d44ecd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,8 @@ __visible void enter_from_user_mode(void) CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit(); } +#else +static inline void enter_from_user_mode(void) {} #endif static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) @@ -85,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; -#ifdef CONFIG_CONTEXT_TRACKING - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - enter_from_user_mode(); - work &= ~_TIF_NOHZ; - } -#endif - #ifdef CONFIG_SECCOMP /* * Do seccomp first -- it should minimize exposure of other @@ -344,6 +335,7 @@ __visible void do_syscall_64(struct pt_regs *regs) struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long nr = regs->orig_ax; + enter_from_user_mode(); local_irq_enable(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) @@ -366,9 +358,9 @@ __visible void do_syscall_64(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. This function is extremely hot - * in workloads that use it, and it's usually called from + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. 
*/ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) @@ -409,6 +401,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible void do_int80_syscall_32(struct pt_regs *regs) { + enter_from_user_mode(); local_irq_enable(); do_syscall_32_irqs_on(regs); } @@ -431,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - /* - * Fetch EBP from where the vDSO stashed it. - * - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! - */ + enter_from_user_mode(); + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ if ( #ifdef CONFIG_X86_64 /* @@ -454,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -#ifdef CONFIG_CONTEXT_TRACKING - enter_from_user_mode(); -#endif prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f2e2302c406f..82866697fcf1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -134,7 +134,10 @@ struct thread_info { #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -/* work to do in syscall_trace_enter() */ +/* + * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for + * enter_from_user_mode() + */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ -- cgit From 844a5fe219cf472060315971e15cbf97674a3324 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Mar 2016 12:13:39 +0100 Subject: KVM: MMU: fix ept=0/pte.u=1/pte.w=0/CR0.WP=0/CR4.SMEP=1/EFER.NX=0 combo Yes, all of these are needed. :) This is admittedly a bit odd, but kvm-unit-tests access.flat tests this if you run it with "-cpu host" and of course ept=0. KVM runs the guest with CR0.WP=1, so it must handle supervisor writes specially when pte.u=1/pte.w=0/CR0.WP=0. Such writes cause a fault when U=1 and W=0 in the SPTE, but they must succeed because CR0.WP=0. When KVM gets the fault, it sets U=0 and W=1 in the shadow PTE and restarts execution. This will still cause a user write to fault, while supervisor writes will succeed. User reads will fault spuriously now, and KVM will then flip U and W again in the SPTE (U=1, W=0). User reads will be enabled and supervisor writes disabled, going back to the originary situation where supervisor writes fault spuriously. When SMEP is in effect, however, U=0 will enable kernel execution of this page. To avoid this, KVM also sets NX=1 in the shadow PTE together with U=0. If the guest has not enabled NX, the result is a continuous stream of page faults due to the NX bit being reserved. The fix is to force EFER.NX=1 even if the CPU is taking care of the EFER switch. (All machines with SMEP have the CPU_LOAD_IA32_EFER vm-entry control, so they do not use user-return notifiers for EFER---if they did, EFER.NX would be forced to the same value as the host). There is another bug in the reserved bit check, which I've split to a separate patch for easier application to stable kernels. 
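As a rough, standalone illustration of the two SPTE states described above (this is not KVM code -- the bit names, layout and helper below are invented for the sketch), the U/W/NX combinations and the reason SMEP forces NX can be modelled like this:

  /*
   * Toy model of the shadow-PTE permission flip used when the guest has
   * pte.u=1, pte.w=0 and CR0.WP=0. Illustrative only; these bit values
   * do not correspond to KVM's real SPTE layout.
   */
  #include <stdbool.h>
  #include <stdio.h>

  #define SPTE_U  (1u << 0)   /* user-accessible */
  #define SPTE_W  (1u << 1)   /* writable */
  #define SPTE_NX (1u << 2)   /* no-execute */

  static unsigned int shadow_pte_for(bool supervisor_write, bool smep)
  {
          if (!supervisor_write)
                  return SPTE_U;  /* U=1/W=0: user reads OK, all writes fault */

          /*
           * A supervisor write with CR0.WP=0 flips the SPTE to U=0/W=1.
           * With SMEP enabled, U=0 would make the page kernel-executable,
           * so NX must be set too -- which in turn requires EFER.NX=1 in
           * hardware, hence the update_transition_efer() change below.
           */
          return SPTE_W | (smep ? SPTE_NX : 0);
  }

  int main(void)
  {
          printf("after user access:      %#x\n", shadow_pte_for(false, true));
          printf("after supervisor write: %#x\n", shadow_pte_for(true, true));
          return 0;
  }

The point of the sketch is only the last return statement: once SMEP is in effect, the supervisor-write state must carry NX, and that NX bit has to be acceptable both to the hardware (EFER.NX) and to the reserved-bit checks (next patch).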
Cc: stable@vger.kernel.org Cc: Andy Lutomirski Reviewed-by: Xiao Guangrong Fixes: f6577a5fa15d82217ca73c74cd2dcbc0f6c781dd Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e51493ff4f9..9bd8f44baded 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1857,26 +1857,31 @@ static void reload_tss(void) static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - u64 guest_efer; - u64 ignore_bits; + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; - guest_efer = vmx->vcpu.arch.efer; + if (!enable_ept) { + /* + * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing + * host CPUID is more efficient than testing guest CPUID + * or CR4. Host SMEP is anyway a requirement for guest SMEP. + */ + if (boot_cpu_has(X86_FEATURE_SMEP)) + guest_efer |= EFER_NX; + else if (!(guest_efer & EFER_NX)) + ignore_bits |= EFER_NX; + } /* - * NX is emulated; LMA and LME handled by hardware; SCE meaningless - * outside long mode + * LMA and LME handled by hardware; SCE meaningless outside long mode. */ - ignore_bits = EFER_NX | EFER_SCE; + ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64 ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); @@ -1887,16 +1892,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ if (cpu_has_load_ia32_efer || (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); return false; - } + } else { + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + return true; + } } static unsigned long segment_base(u16 selector) -- cgit From 5f0b819995e172f48fdcd91335a2126ba7d9deae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Mar 2016 14:28:02 +0100 Subject: KVM: MMU: fix reserved bit check for ept=0/CR0.WP=0/CR4.SMEP=1/EFER.NX=0 KVM has special logic to handle pages with pte.u=1 and pte.w=0 when CR0.WP=1. These pages' SPTEs flip continuously between two states: U=1/W=0 (user and supervisor reads allowed, supervisor writes not allowed) and U=0/W=1 (supervisor reads and writes allowed, user writes not allowed). When SMEP is in effect, however, U=0 will enable kernel execution of this page. To avoid this, KVM also sets NX=1 in the shadow PTE together with U=0, making the two states U=1/W=0/NX=gpte.NX and U=0/W=1/NX=1. When guest EFER has the NX bit cleared, the reserved bit check thinks that the latter state is invalid; teach it that the smep_andnot_wp case will also use the NX bit of SPTEs. 
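[ Illustrative sketch, not part of the patch: what the one-line uses_nx change in the hunk below amounts to. SPTE_NX_BIT and the helper are invented names for the example; the real logic lives inside __reset_rsvds_bits_mask(). ]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_NX_BIT (1ULL << 63)	/* the NX/XD bit of a 64-bit PTE */

/* If NX is not considered "in use", a set NX bit is treated as reserved. */
static bool nx_is_reserved(uint64_t spte, bool uses_nx)
{
	return !uses_nx && (spte & SPTE_NX_BIT);
}

int main(void)
{
	uint64_t spte = SPTE_NX_BIT | 0x1000;	/* the U=0/W=1/NX=1 state */
	bool ctx_nx = false;			/* guest EFER.NX = 0      */
	bool smep_andnot_wp = true;

	/* Old check: the NX-carrying SPTE is flagged as having reserved bits. */
	printf("old check: reserved=%d\n", nx_is_reserved(spte, ctx_nx));
	/* New check: smep_andnot_wp also counts as "NX in use".              */
	printf("new check: reserved=%d\n",
	       nx_is_reserved(spte, ctx_nx || smep_andnot_wp));
	return 0;
}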
Cc: stable@vger.kernel.org Reviewed-by: Xiao Guangrong Fixes: c258b62b264fdc469b6d3610a907708068145e3b Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95a955de5964..1e7a49bfc94f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3721,13 +3721,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, - context->shadow_root_level, context->nx, + context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); } -- cgit From 84477336ec03f8061ffd6908da341e063e5d6d1f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Mar 2016 21:56:22 +0100 Subject: x86/delay: Avoid preemptible context checks in delay_mwaitx() We do use this_cpu_ptr(&cpu_tss) as a cacheline-aligned, seldomly accessed per-cpu var as the MONITORX target in delay_mwaitx(). However, when called in preemptible context, this_cpu_ptr -> smp_processor_id() -> debug_smp_processor_id() fires: BUG: using smp_processor_id() in preemptible [00000000] code: udevd/312 caller is delay_mwaitx+0x40/0xa0 But we don't care about that check - we only need cpu_tss as a MONITORX target and it doesn't really matter which CPU's var we're touching as we're going idle anyway. Fix that. Suggested-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Huang Rui Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: spg_linux_kernel@amd.com Link: http://lkml.kernel.org/r/20160309205622.GG6564@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e912b2f6d36e..2f07c291dcc8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops) * Use cpu_tss as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf -- cgit From 10ee73865e9e4705ba8b3f4d6149e8e68d902bb7 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Thu, 10 Mar 2016 20:19:58 +0800 Subject: x86/entry/traps: Show unhandled signal for i386 in do_trap() Commit abd4f7505baf ("x86: i386-show-unhandled-signals-v3") did turn on the showing-unhandled-signal behaviour for i386 for some exception handlers, but for no reason do_trap() is left out (my naive guess is because turning it on for do_trap() would be too noisy since do_trap() is shared by several exceptions). And since the same commit make "show_unhandled_signals" a debug tunable(in /proc/sys/debug/exception-trace), and x86 by default turning it on. So it would be strange for i386 users who turing it on manually and expect seeing the unhandled signal output in log, but nothing. This patch turns it on for i386 in do_trap() as well. Signed-off-by: Jianyu Zhan Reviewed-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Cc: dave.hansen@linux.intel.com Cc: heukelum@fastmail.fm Cc: jbeulich@novell.com Cc: jdike@addtoit.com Cc: joe@perches.com Cc: luto@kernel.org Link: http://lkml.kernel.org/r/1457612398-4568-1-git-send-email-nasa4836@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5fae2d840e64..1baf08187ddf 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -248,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; -#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", @@ -257,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); pr_cont("\n"); } -#endif force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } -- cgit From 83cc54a608b66d3fffa7355503263d20f491bb44 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 2 Mar 2016 15:31:03 -0700 Subject: x86/PCI: VMD: Document code for maintainability Comment the less obvious portion of the code for setting up memory windows, and the platform dependency for initializing the h/w with appropriate resources. Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- arch/x86/pci/vmd.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index d57e48016f15..29aebbcb510b 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -532,6 +532,23 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; + /* + * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can + * put 32-bit resources in the window. + * + * There's no hardware reason why a 64-bit window *couldn't* + * contain a 32-bit resource, but pbus_size_mem() computes the + * bridge window size assuming a 64-bit window will contain no + * 32-bit resources. __pci_assign_resource() enforces that + * artificial restriction to make sure everything will fit. + * + * The only way we could use a 64-bit non-prefechable MEMBAR is + * if its address is <4GB so that we can convert it to a 32-bit + * resource. To be visible to the host OS, all VMD endpoints must + * be initially configured by platform BIOS, which includes setting + * up these resources. We can assume the device is configured + * according to the platform needs. + */ res = &vmd->dev->resource[VMD_MEMBAR1]; upper_bits = upper_32_bits(res->end); flags = res->flags & ~IORESOURCE_SIZEALIGN; -- cgit From d068c350c0486ba9011abb6f91cdc0c12b522155 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 2 Mar 2016 15:31:04 -0700 Subject: x86/PCI: VMD: Set bus resource start to 0 The bus always starts at 0. Due to alignment and down-casting, this happened to work before, but looked alarmingly incorrect in kernel logs. 
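[ Illustrative sketch, not part of the patch: why ".end = (resource_size(res) >> 20) - 1" in the hunk below pairs naturally with ".start = 0". The CFGBAR is an ECAM-like region, i.e. 4 KB of config space per function x 8 functions x 32 devices = 1 MB per bus, so the CFGBAR size in MB is the number of buses behind the VMD. The 32 MB size is made up for the example. ]

#include <stdio.h>

int main(void)
{
	unsigned long long cfgbar_size = 32ULL << 20;	/* hypothetical 32 MB CFGBAR     */
	unsigned int nr_buses = cfgbar_size >> 20;	/* 1 MB of config space per bus  */

	printf("bus range: 0..%u\n", nr_buses - 1);	/* matches .start = 0, .end = N-1 */
	return 0;
}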
Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- arch/x86/pci/vmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index 29aebbcb510b..a08183e604d1 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -527,7 +527,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) res = &vmd->dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = res->start, + .start = 0, .end = (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; -- cgit From 2c2c5c5cd213aea38c850bb6edc9b7f77f29802f Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 24 Feb 2016 10:06:37 -0700 Subject: x86/PCI: VMD: Attach VMD resources to parent domain's resource tree Attach the new VMD domain's resources to the VMD device's resources. This allows /proc/iomem to display a more complete picture. Before: c0000000-c1ffffff : 0000:5d:05.5 c2000000-c3ffffff : 0000:5d:05.5 c2010000-c2013fff : nvme c4000000-c40fffff : 0000:5d:05.5 After: c0000000-c1ffffff : 0000:5d:05.5 c2000000-c3ffffff : 0000:5d:05.5 c2000000-c3ffffff : VMD MEMBAR1 c2000000-c22fffff : PCI Bus 10000:01 c2000000-c200ffff : 10000:01:00.0 c2010000-c2013fff : 10000:01:00.0 c2010000-c2013fff : nvme c2300000-c24fffff : PCI Bus 10000:01 c4000000-c40fffff : 0000:5d:05.5 c4002000-c40fffff : VMD MEMBAR2 Signed-off-by: Jon Derrick Signed-off-by: Bjorn Helgaas Reviewed-by: Keith Busch --- arch/x86/pci/vmd.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index a08183e604d1..7792aba266df 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -503,6 +503,18 @@ static struct pci_ops vmd_ops = { .write = vmd_pci_write, }; +static void vmd_attach_resources(struct vmd_dev *vmd) +{ + vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1]; + vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2]; +} + +static void vmd_detach_resources(struct vmd_dev *vmd) +{ + vmd->dev->resource[VMD_MEMBAR1].child = NULL; + vmd->dev->resource[VMD_MEMBAR2].child = NULL; +} + /* * VMD domains start at 0x1000 to not clash with ACPI _SEG domains. */ @@ -559,6 +571,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start, .end = res->end, .flags = flags, + .parent = res, }; res = &vmd->dev->resource[VMD_MEMBAR2]; @@ -571,6 +584,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start + 0x2000, .end = res->end, .flags = flags, + .parent = res, }; sd->domain = vmd_find_free_domain(); @@ -595,6 +609,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) return -ENODEV; } + vmd_attach_resources(vmd); vmd_setup_dma_ops(vmd); dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); pci_rescan_bus(vmd->bus); @@ -691,6 +706,7 @@ static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); + vmd_detach_resources(vmd); pci_set_drvdata(dev, NULL); sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); -- cgit From 8b8addf891de8a00e4d39fc32f93f7c5eb8feceb Mon Sep 17 00:00:00 2001 From: Hector Marco-Gisbert Date: Thu, 10 Mar 2016 20:51:00 +0100 Subject: x86/mm/32: Enable full randomization on i386 and X86_32 Currently on i386 and on X86_64 when emulating X86_32 in legacy mode, only the stack and the executable are randomized but not other mmapped files (libraries, vDSO, etc.). 
This patch enables randomization for the libraries, vDSO and mmap requests on i386 and in X86_32 in legacy mode. By default on i386 there are 8 bits for the randomization of the libraries, vDSO and mmaps which only uses 1MB of VA. This patch preserves the original randomness, using 1MB of VA out of 3GB or 4GB. We think that 1MB out of 3GB is not a big cost for having the ASLR. The first obvious security benefit is that all objects are randomized (not only the stack and the executable) in legacy mode which highly increases the ASLR effectiveness, otherwise the attackers may use these non-randomized areas. But also sensitive setuid/setgid applications are more secure because currently, attackers can disable the randomization of these applications by setting the ulimit stack to "unlimited". This is a very old and widely known trick to disable the ASLR in i386 which has been allowed for too long. Another trick used to disable the ASLR was to set the ADDR_NO_RANDOMIZE personality flag, but fortunately this doesn't work on setuid/setgid applications because there is security checks which clear Security-relevant flags. This patch always randomizes the mmap_legacy_base address, removing the possibility to disable the ASLR by setting the stack to "unlimited". Signed-off-by: Hector Marco-Gisbert Acked-by: Ismael Ripoll Ripoll Acked-by: Kees Cook Acked-by: Arjan van de Ven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: kees Cook Link: http://lkml.kernel.org/r/1457639460-5242-1-git-send-email-hecmargi@upv.es Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2bffaf..389939f74dd5 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } -/* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; -- cgit From 0d47638f80a02b15869f1fe1fc09e5bf996750fd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 10 Mar 2016 14:12:13 -0800 Subject: x86/mm/pkeys: Fix mismerge of protection keys CPUID bits Kirill Shutemov pointed this out to me. The tip tree currently has commit: dfb4a70f2 [x86/cpufeature, x86/mm/pkeys: Add protection keys related CPUID definitions] whioch added support for two new CPUID bits: X86_FEATURE_PKU and X86_FEATURE_OSPKE. But, those bits were mis-merged and put in cpufeature.h instead of cpufeatures.h. This didn't cause any breakage *except* it keeps the "ospke" and "pku" bits from showing up in cpuinfo. Now cpuinfo has the two new flags: flags : ... pku ospke BTW, is it really wise to have cpufeature.h and cpufeatures.h? It seems like they can only cause confusion and mahem with tab completion. Reported-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160310221213.06F9DB53@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 4 ---- arch/x86/include/asm/cpufeatures.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 50e292a73a7b..3636ec06c887 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -94,10 +94,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ - /* * This macro is for detection of features which need kernel * infrastructure to be used. It may *not* directly test the CPU diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cbb2c568a19e..89949a299cd5 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -270,6 +270,10 @@ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + /* * BUG word(s) */ -- cgit From 6e6867093de35141f0a76b66ac13f9f2e2c8e77a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 11 Mar 2016 12:32:06 +0100 Subject: x86/fpu: Fix eager-FPU handling on legacy FPU machines i486 derived cores like Intel Quark support only the very old, legacy x87 FPU (FSAVE/FRSTOR, CPUID bit FXSR is not set), and our FPU code wasn't handling the saving and restoring there properly in the 'eagerfpu' case. So after we made eagerfpu the default for all CPU types: 58122bf1d856 x86/fpu: Default eagerfpu=on on all CPUs these old FPU designs broke. First, Andy Shevchenko reported a splat: WARNING: CPU: 0 PID: 823 at arch/x86/include/asm/fpu/internal.h:163 fpu__clear+0x8c/0x160 which was us trying to execute FXRSTOR on those machines even though they don't support it. After taking care of that, Bryan O'Donoghue reported that a simple FPU test still failed because we weren't initializing the FPU state properly on those machines. Take care of all that. Reported-and-tested-by: Bryan O'Donoghue Reported-by: Andy Shevchenko Signed-off-by: Borislav Petkov Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: Yu-cheng Link: http://lkml.kernel.org/r/20160311113206.GD4312@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 4 +++- arch/x86/kernel/fpu/init.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c3fc1d..d5804adfa6da 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -409,8 +409,10 @@ static inline void copy_init_fpstate_to_fpregs(void) { if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); - else + else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); + else + copy_kernel_to_fregs(&init_fpstate.fsave); } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 9ee7e307b18f..bd08fb77073d 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -134,7 +134,7 @@ static void __init fpu__init_system_generic(void) * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) */ - fpstate_init_fxstate(&init_fpstate.fxsave); + fpstate_init(&init_fpstate); fpu__init_system_mxcsr(); } -- cgit From 452308de61056a539352a9306c46716d7af8a1f1 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 11 Mar 2016 11:19:23 +0000 Subject: x86/efi: Fix boot crash by always mapping boot service regions into new EFI page tables Some machines have EFI regions in page zero (physical address 0x00000000) and historically that region has been added to the e820 map via trim_bios_range(), and ultimately mapped into the kernel page tables. It was not mapped via efi_map_regions() as one would expect. Alexis reports that with the new separate EFI page tables some boot services regions, such as page zero, are not mapped. This triggers an oops during the SetVirtualAddressMap() runtime call. For the EFI boot services quirk on x86 we need to memblock_reserve() boot services regions until after SetVirtualAddressMap(). Doing that while respecting the ownership of regions that may have already been reserved by the kernel was the motivation behind this commit: 7d68dc3f1003 ("x86, efi: Do not reserve boot services regions within reserved areas") That patch was merged at a time when the EFI runtime virtual mappings were inserted into the kernel page tables as described above, and the trick of setting ->numpages (and hence the region size) to zero to track regions that should not be freed in efi_free_boot_services() meant that we never mapped those regions in efi_map_regions(). Instead we were relying solely on the existing kernel mappings. Now that we have separate page tables we need to make sure the EFI boot services regions are mapped correctly, even if someone else has already called memblock_reserve(). Instead of stashing a tag in ->numpages, set the EFI_MEMORY_RUNTIME bit of ->attribute. Since it generally makes no sense to mark a boot services region as required at runtime, it's pretty much guaranteed the firmware will not have already set this bit. For the record, the specific circumstances under which Alexis triggered this bug was that an EFI runtime driver on his machine was responding to the EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE event during SetVirtualAddressMap(). 
The event handler for this driver looks like this, sub rsp,0x28 lea rdx,[rip+0x2445] # 0xaa948720 mov ecx,0x4 call func_aa9447c0 ; call to ConvertPointer(4, & 0xaa948720) mov r11,QWORD PTR [rip+0x2434] # 0xaa948720 xor eax,eax mov BYTE PTR [r11+0x1],0x1 add rsp,0x28 ret Which is pretty typical code for an EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE handler. The "mov r11, QWORD PTR [rip+0x2424]" was the faulting instruction because ConvertPointer() was being called to convert the address 0x0000000000000000, which when converted is left unchanged and remains 0x0000000000000000. The output of the oops trace gave the impression of a standard NULL pointer dereference bug, but because we're accessing physical addresses during ConvertPointer(), it wasn't. EFI boot services code is stored at that address on Alexis' machine. Reported-by: Alexis Murzeau Signed-off-by: Matt Fleming Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Ben Hutchings Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Raphael Hertzog Cc: Roger Shimizu Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1457695163-29632-2-git-send-email-matt@codeblueprint.co.uk Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=815125 Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 79 +++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 2d66db8f80f9..ed30e79347e8 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -130,6 +130,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) } EXPORT_SYMBOL_GPL(efi_query_variable_store); +/* + * Helper function for efi_reserve_boot_services() to figure out if we + * can free regions in efi_free_boot_services(). + * + * Use this function to ensure we do not free regions owned by somebody + * else. We must only reserve (and then free) regions: + * + * - Not within any part of the kernel + * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc) + */ +static bool can_free_region(u64 start, u64 size) +{ + if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) + return false; + + if (!e820_all_mapped(start, start+size, E820_RAM)) + return false; + + return true; +} + /* * The UEFI specification makes it clear that the operating system is free to do * whatever it wants with boot services code after ExitBootServices() has been @@ -147,26 +168,50 @@ void __init efi_reserve_boot_services(void) efi_memory_desc_t *md = p; u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; + bool already_reserved; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) continue; - /* Only reserve where possible: - * - Not within any already allocated areas - * - Not over any memory area (really needed, if above?) 
- * - Not within any part of the kernel - * - Not the bios reserved area - */ - if ((start + size > __pa_symbol(_text) - && start <= __pa_symbol(_end)) || - !e820_all_mapped(start, start+size, E820_RAM) || - memblock_is_region_reserved(start, size)) { - /* Could not reserve, skip it */ - md->num_pages = 0; - memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", - start, start+size-1); - } else + + already_reserved = memblock_is_region_reserved(start, size); + + /* + * Because the following memblock_reserve() is paired + * with free_bootmem_late() for this region in + * efi_free_boot_services(), we must be extremely + * careful not to reserve, and subsequently free, + * critical regions of memory (like the kernel image) or + * those regions that somebody else has already + * reserved. + * + * A good example of a critical region that must not be + * freed is page zero (first 4Kb of memory), which may + * contain boot services code/data but is marked + * E820_RESERVED by trim_bios_range(). + */ + if (!already_reserved) { memblock_reserve(start, size); + + /* + * If we are the first to reserve the region, no + * one else cares about it. We own it and can + * free it later. + */ + if (can_free_region(start, size)) + continue; + } + + /* + * We don't own the region. We must not free it. + * + * Setting this bit for a boot services region really + * doesn't make sense as far as the firmware is + * concerned, but it does provide us with a way to tag + * those regions that must not be paired with + * free_bootmem_late(). + */ + md->attribute |= EFI_MEMORY_RUNTIME; } } @@ -183,8 +228,8 @@ void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; - /* Could not reserve boot area */ - if (!size) + /* Do not free, someone else owns it: */ + if (md->attribute & EFI_MEMORY_RUNTIME) continue; free_bootmem_late(start, size); -- cgit From d05004944206cbbf1c453e179768163731c7c6f1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 10 Mar 2016 19:38:18 -0800 Subject: x86/cpufeature: Enable new AVX-512 features A few new AVX-512 instruction groups/features are added in cpufeatures.h for enuermation: AVX512DQ, AVX512BW, and AVX512VL. Clear the flags in fpu__xstate_clear_all_cpu_caps(). The specification for latest AVX-512 including the features can be found at: https://software.intel.com/sites/default/files/managed/07/b7/319433-023.pdf Note, I didn't enable the flags in KVM. Hopefully the KVM guys can pick up the flags and enable them in KVM. Signed-off-by: Fenghua Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: Denys Vlasenko Cc: Gleb Natapov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V Shankar Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1457667498-37357-1-git-send-email-fenghua.yu@intel.com [ Added more detailed feature descriptions. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d11a3aaafd96..9e0567f4c081 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -220,6 +220,7 @@ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ @@ -230,6 +231,8 @@ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ #define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d425cda5ae6d..6e8354f5a593 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); + setup_clear_cpu_cap(X86_FEATURE_AVX512BW); + setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); } -- cgit From 01cfbad79a5e2b835abf6a8154a341d75a6fc8cd Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 11 Mar 2016 14:05:34 -0800 Subject: ipv4: Update parameters for csum_tcpudp_magic to their original types This patch updates all instances of csum_tcpudp_magic and csum_tcpudp_nofold to reflect the types that are usually used as the source inputs. For example the protocol field is populated based on nexthdr which is actually an unsigned 8 bit value. The length is usually populated based on skb->len which is an unsigned integer. This addresses an issue in which the IPv6 function csum_ipv6_magic was generating a checksum using the full 32b of skb->len while csum_tcpudp_magic was only using the lower 16 bits. As a result we could run into issues when attempting to adjust the checksum as there was no protocol agnostic way to update it. With this change the value is still truncated as many architectures use "(len + proto) << 8", however this truncation only occurs for values greater than 16776960 in length and as such is unlikely to occur as we stop the inner headers at ~64K in size. I did have to make a few minor changes in the arm, mn10300, nios2, and score versions of the function in order to support these changes as they were either using things such as an OR to combine the protocol and length, or were using ntohs to convert the length which would have truncated the value. I also updated a few spots in terms of whitespace and type differences for the addresses. 
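[ Illustrative sketch, not part of the patch: a stand-alone demo of the truncation issue described above. The "(len + proto) << 8" form is the schematic expression quoted in the changelog, not any particular architecture's implementation, and fold() mimics csum_fold(); once the length exceeds 64 KB, feeding a 16-bit-truncated length into the pseudo-header sum yields a different checksum than the full 32-bit length, which is why the prototypes move to __u32. ]

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit partial sum into a 16-bit ones-complement checksum. */
static uint16_t fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Schematic pseudo-header contribution of length and protocol. */
static uint32_t pseudo_sum(uint32_t len, uint8_t proto)
{
	return (len + proto) << 8;
}

int main(void)
{
	uint32_t len = 70000;	/* > 64 KB, e.g. a large GSO superframe */
	uint8_t proto = 6;	/* IPPROTO_TCP */

	/* The old 'unsigned short len' prototype silently truncated the length: */
	printf("32-bit len: %#x\n", (unsigned int)fold(pseudo_sum(len, proto)));
	printf("16-bit len: %#x\n", (unsigned int)fold(pseudo_sum((uint16_t)len, proto)));
	return 0;
}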
Most of this was just to make sure all of the definitions were in sync going forward. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- arch/x86/include/asm/checksum_32.h | 6 ++---- arch/x86/include/asm/checksum_64.h | 8 ++++---- arch/x86/um/asm/checksum.h | 9 ++++----- 3 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index f50de6951738..6f380605403d 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -112,8 +112,7 @@ static inline __sum16 csum_fold(__wsum sum) } static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, + __u32 len, __u8 proto, __wsum sum) { asm("addl %1, %0 ;\n" @@ -131,8 +130,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, * returns a 16-bit checksum, already complemented */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, + __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index cd00e1774491..97b98e2039bc 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -84,8 +84,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) * 32bit unfolded. */ static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, - unsigned short proto, __wsum sum) +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { asm(" addl %1, %0\n" " adcl %2, %0\n" @@ -110,8 +110,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, * complemented and ready to be filled in. */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, __wsum sum) + __u32 len, __u8 proto, + __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index ee940185e89f..54d96f1e3594 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -87,8 +87,8 @@ static inline __sum16 csum_fold(__wsum sum) * 32bit unfolded. */ static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, - unsigned short proto, __wsum sum) +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { asm(" addl %1, %0\n" " adcl %2, %0\n" @@ -104,9 +104,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, * returns a 16-bit checksum, already complemented */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, - unsigned short len, - unsigned short proto, - __wsum sum) + __u32 len, __u8 proto, + __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); } -- cgit From 1e94082963747b551b129528714827f76a090e93 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 11 Mar 2016 14:05:41 -0800 Subject: ipv6: Pass proto to csum_ipv6_magic as __u8 instead of unsigned short This patch updates csum_ipv6_magic so that it correctly recognizes that protocol is a unsigned 8 bit value. This will allow us to better understand what limitations may or may not be present in how we handle the data. For example there are a number of places that call htonl on the protocol value. 
This is likely not necessary and can be replaced with a multiplication by ntohl(1) which will be converted to a shift by the compiler. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- arch/x86/include/asm/checksum_32.h | 3 +-- arch/x86/include/asm/checksum_64.h | 2 +- arch/x86/lib/csum-wrappers_64.c | 2 +- arch/x86/um/asm/checksum_32.h | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 6f380605403d..532f85e6651f 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -149,8 +149,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len) #define _HAVE_ARCH_IPV6_CSUM static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, - __wsum sum) + __u32 len, __u8 proto, __wsum sum) { asm("addl 0(%1), %0 ;\n" "adcl 4(%1), %0 ;\n" diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 97b98e2039bc..c020ee75dce7 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -177,7 +177,7 @@ struct in6_addr; #define _HAVE_ARCH_IPV6_CSUM 1 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, __wsum sum); + __u32 len, __u8 proto, __wsum sum); static inline unsigned add32_with_carry(unsigned a, unsigned b) { diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 1318f75d56e4..28a6654f0d08 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -135,7 +135,7 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck); __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, __wsum sum) + __u32 len, __u8 proto, __wsum sum) { __u64 rest, sum64; diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h index ab77b6f9a4bf..83a75f8a1233 100644 --- a/arch/x86/um/asm/checksum_32.h +++ b/arch/x86/um/asm/checksum_32.h @@ -13,7 +13,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len) #define _HAVE_ARCH_IPV6_CSUM static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, + __u32 len, __u8 proto, __wsum sum) { __asm__( -- cgit From 743146db071c4a828159211a295d12ff4f61752f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 15 Mar 2016 09:20:33 +0100 Subject: x86/video: Don't assume all FB devices are PCI devices When booting Hyper-V Generation 2 guests KASAN reports the following out-of-bounds access: BUG: KASAN: slab-out-of-bounds in fb_is_primary_device+0x58/0x70 at addr ffff880079cf0eb0 Read of size 8 by task swapper/0/1 ... [] dump_stack+0x63/0x8b [] print_trailer+0xf9/0x150 [] object_err+0x34/0x40 [] kasan_report_error+0x230/0x550 [] kasan_report+0x58/0x60 [] ? ___slab_alloc+0x80/0x490 [] ? fb_is_primary_device+0x58/0x70 [] __asan_load8+0x5d/0x70 [] fb_is_primary_device+0x58/0x70 [] register_framebuffer+0xda/0x5b0 [] ? remove_conflicting_framebuffers+0x50/0x50 ... The issue is caused by the to_pci_dev() call with no check that the given info->device is in fact a PCI device and some FB devices (Hyper-V FB, EFI FB,...) are not. While on it, clean up the function. Signed-off-by: Vitaly Kuznetsov Acked-by: Bjorn Helgaas Cc: Bjorn Helgaas Cc: Cathy Avery Cc: K. Y. 
Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1458030033-10122-1-git-send-email-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/video/fbdev.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index d5644bbe8cba..9fd24846d094 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -14,26 +14,24 @@ int fb_is_primary_device(struct fb_info *info) { struct device *device = info->device; - struct pci_dev *pci_dev = NULL; struct pci_dev *default_device = vga_default_device(); - struct resource *res = NULL; + struct pci_dev *pci_dev; + struct resource *res; - if (device) - pci_dev = to_pci_dev(device); - - if (!pci_dev) + if (!device || !dev_is_pci(device)) return 0; + pci_dev = to_pci_dev(device); + if (default_device) { if (pci_dev == default_device) return 1; - else - return 0; + return 0; } - res = &pci_dev->resource[PCI_ROM_RESOURCE]; + res = pci_dev->resource + PCI_ROM_RESOURCE; - if (res && res->flags & IORESOURCE_ROM_SHADOW) + if (res->flags & IORESOURCE_ROM_SHADOW) return 1; return 0; -- cgit From 288cf3c64e4522d28349de5345348574cbe9df83 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:57:33 -0700 Subject: x86: query dynamic DEBUG_PAGEALLOC setting We can use debug_pagealloc_enabled() to check if we can map the identity mapping with 2MB pages. We can also add the state into the dump_stack output. The patch does not touch the code for the 1GB pages, which ignored CONFIG_DEBUG_PAGEALLOC. Do we need to fence this as well? Signed-off-by: Christian Borntraeger Reviewed-by: Thomas Gleixner Acked-by: David Rientjes Cc: Laura Abbott Cc: Joonsoo Kim Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/dumpstack.c | 5 ++--- arch/x86/mm/init.c | 7 ++++--- arch/x86/mm/pageattr.c | 14 ++++---------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acfadae2..32e5699eadfe 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,9 +265,8 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_SMP printk("SMP "); #endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC "); -#endif + if (debug_pagealloc_enabled()) + printk("DEBUG_PAGEALLOC "); #ifdef CONFIG_KASAN printk("KASAN"); #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 493f54172b4a..39823fd91396 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -150,13 +150,14 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { -#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) +#if !defined(CONFIG_KMEMCHECK) /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will + * use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. 
*/ - if (cpu_has_pse) + if (cpu_has_pse && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1c37e650acac..e64a4703f8b1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void) #endif -#ifdef CONFIG_DEBUG_PAGEALLOC -# define debug_pagealloc 1 -#else -# define debug_pagealloc 0 -#endif - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -714,10 +708,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, { struct page *base; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -1337,10 +1331,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) return ret; -- cgit From a75e1f637cf137f82cf025321e7a53adeeed7029 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:57:39 -0700 Subject: x86: also use debug_pagealloc_enabled() for free_init_pages we want to couple all debugging features with debug_pagealloc_enabled() and not with the config option CONFIG_DEBUG_PAGEALLOC. Signed-off-by: Christian Borntraeger Suggested-by: David Rientjes Acked-by: David Rientjes Reviewed-by: Thomas Gleixner Cc: Laura Abbott Cc: Joonsoo Kim Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 39823fd91396..9d56f271d519 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -667,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) * mark them not present - any buggy init-section access will * create a kernel page fault: */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", - begin, end - 1); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable and non-executable first. - */ - set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + if (debug_pagealloc_enabled()) { + pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + } else { + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. 
+ */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); -#endif + free_reserved_area((void *)begin, (void *)end, + POISON_FREE_INITMEM, what); + } } void free_initmem(void) -- cgit From d367cef0a7f0c6ee86e997c0cb455b21b3c6b9ba Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 14 Mar 2016 10:33:01 +0000 Subject: x86/mm/pat: Fix boot crash when 1GB pages are not supported by the CPU Scott reports that with the new separate EFI page tables he's seeing the following error on boot, caused by setting reserved bits in the page table structures (fault code is PF_RSVD | PF_PROT), swapper/0: Corrupted page table at address 17b102020 PGD 17b0e5063 PUD 1400000e3 Bad pagetable: 0009 [#1] SMP On first inspection the PUD is using a 1GB page size (_PAGE_PSE) and looks fine but that's only true if support for 1GB PUD pages ("pdpe1gb") is present in the CPU. Scott's Intel Celeron N2820 does not have that feature and so the _PAGE_PSE bit is reserved. Fix this issue by making the 1GB mapping code in conditional on "cpu_has_gbpages". This issue didn't come up in the past because the required mapping for the faulting address (0x17b102020) will already have been setup by the kernel in early boot before we got to efi_map_regions(), but we no longer use the standard kernel page tables during EFI calls. Reported-by: Scott Ashcroft Tested-by: Scott Ashcroft Signed-off-by: Matt Fleming Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Ben Hutchings Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Raphael Hertzog Cc: Roger Shimizu Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1457951581-27353-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 14c38ae80409..fcf8e290740a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1055,7 +1055,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (end - start >= PUD_SIZE) { + while (cpu_has_gbpages && end - start >= PUD_SIZE) { set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); -- cgit From cbf8b5a2b649a501758291cb4d4ba1e5711771ba Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 14 Mar 2016 15:33:39 -0700 Subject: x86/mm, x86/mce: Fix return type/value for memcpy_mcsafe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Returning a 'bool' was very unpopular. Doubly so because the code was just wrong (returning zero for true, one for false; great for shell programming, not so good for C). Change return type to "int". Keep zero as the success indicator because it matches other similar code and people may be more comfortable writing: if (memcpy_mcsafe(to, from, count)) { printk("Sad panda, copy failed\n"); ... } Make the failure return value -EFAULT for now. 
Reported by: Mika Penttilä Signed-off-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mika.penttila@nextfour.com Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()") Link: http://lkml.kernel.org/r/695f14233fa7a54fcac4406c706d7fec228e3f4c.1457993040.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/string_64.h | 4 ++-- arch/x86/lib/memcpy_64.S | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ca6ba3607705..90dbbd9666d4 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -87,9 +87,9 @@ int strcmp(const char *cs, const char *ct); * * Low level memory copy function that catches machine checks * - * Return true for success, false for fail + * Return 0 for success, -EFAULT for fail */ -bool memcpy_mcsafe(void *dst, const void *src, size_t cnt); +int memcpy_mcsafe(void *dst, const void *src, size_t cnt); #endif /* __KERNEL__ */ diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index cbb8ee5830ff..2ec0b0abbfaa 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,6 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include +#include #include #include @@ -268,16 +269,16 @@ ENTRY(memcpy_mcsafe) decl %ecx jnz .L_copy_trailing_bytes - /* Copy successful. Return true */ + /* Copy successful. Return zero */ .L_done_memcpy_trap: xorq %rax, %rax ret ENDPROC(memcpy_mcsafe) .section .fixup, "ax" - /* Return false for any failure */ + /* Return -EFAULT for any failure */ .L_memcpy_mcsafe_fail: - mov $1, %rax + mov $-EFAULT, %rax ret .previous -- cgit From b7a584598aea7ca73140cb87b40319944dd3393f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 16 Mar 2016 14:14:21 -0700 Subject: x86/iopl/64: Properly context-switch IOPL on Xen PV On Xen PV, regs->flags doesn't reliably reflect IOPL and the exit-to-userspace code doesn't change IOPL. We need to context switch it manually. I'm doing this without going through paravirt because this is specific to Xen PV. After the dust settles, we can merge this with the 32-bit code, tidy up the iopl syscall implementation, and remove the set_iopl pvop entirely. Fixes XSA-171. Reviewewd-by: Jan Beulich Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Vrabel Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/693c3bd7aeb4d3c27c92c622b7d0f554a458173c.1458162709.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypervisor.h | 2 ++ arch/x86/kernel/process_64.c | 12 ++++++++++++ arch/x86/xen/enlighten.c | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 8b2d4bea9962..39171b3646bb 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,4 +62,6 @@ void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif +extern void xen_set_iopl_mask(unsigned mask); + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b9d99e0f82c4..9f751876066f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -48,6 +48,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -411,6 +412,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); +#ifdef CONFIG_XEN + /* + * On Xen PV, IOPL bits in pt_regs->flags have no effect, and + * current_pt_regs()->flags may not match the current task's + * intended IOPL. We need to switch it manually. + */ + if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && + prev->iopl != next->iopl)) + xen_set_iopl_mask(next->iopl); +#endif + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2c261082eadf..8381fb990c7f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -961,7 +961,7 @@ static void xen_load_sp0(struct tss_struct *tss, tss->x86_tss.sp0 = thread->sp0; } -static void xen_set_iopl_mask(unsigned mask) +void xen_set_iopl_mask(unsigned mask) { struct physdev_set_iopl set_iopl; -- cgit From c29016cf41fe9fa994a5ecca607cf5f1cd98801e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 16 Mar 2016 14:14:22 -0700 Subject: x86/iopl: Fix iopl capability check on Xen PV iopl(3) is supposed to work if iopl is already 3, even if unprivileged. This didn't work right on Xen PV. Fix it. Reviewewd-by: Jan Beulich Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Vrabel Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/8ce12013e6e4c0a44a97e316be4a6faff31bd5ea.1458162709.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/ioport.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 37dae792dbbe..589b3193f102 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) SYSCALL_DEFINE1(iopl, unsigned int, level) { struct pt_regs *regs = current_pt_regs(); - unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; + /* + * Careful: the IOPL bits in regs->flags are undefined under Xen PV + * and changing them has no effect. + */ + unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + if (level > 3) return -EINVAL; /* Trying to gain more privileges? */ @@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - t->iopl = level << 12; + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | + (level << X86_EFLAGS_IOPL_BIT); + t->iopl = level << X86_EFLAGS_IOPL_BIT; set_iopl_mask(t->iopl); return 0; -- cgit From 3ed3a4f0ddffece942bb2661924d87be4ce63cb7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:19:11 -0700 Subject: mm: cleanup *pte_alloc* interfaces There are few things about *pte_alloc*() helpers worth cleaning up: - 'vma' argument is unused, let's drop it; - most __pte_alloc() callers do speculative check for pmd_none(), before taking ptl: let's introduce pte_alloc() macro which does the check. The only direct user of __pte_alloc left is userfaultfd, which has different expectation about atomicity wrt pmd. - pte_alloc_map() and pte_alloc_map_lock() are redefined using pte_alloc(). [sudeep.holla@arm.com: fix build for arm64 hugetlbpage] [sfr@canb.auug.org.au: fix arch/arm/mm/mmu.c some more] Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Sudeep Holla Acked-by: Kirill A. Shutemov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/tboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 91a4496db434..e72a07f20b05 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); -- cgit From fe896d1878949ea92ba547587bc3075cc688fb8f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:26 -0700 Subject: mm: introduce page reference manipulation functions The success of CMA allocation largely depends on the success of migration and key factor of it is page reference count. Until now, page reference is manipulated by direct calling atomic functions so we cannot follow up who and where manipulate it. Then, it is hard to find actual reason of CMA allocation failure. CMA allocation should be guaranteed to succeed so finding offending place is really important. 
In this patch, call sites where page reference is manipulated are converted to introduced wrapper function. This is preparation step to add tracepoint to each page reference manipulation function. With this facility, we can easily find reason of CMA allocation failure. There is no functional change in this patch. In addition, this patch also converts reference read sites. It will help a second step that renames page._count to something else and prevents later attempt to direct access to it (Suggested by Andrew). Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d8a798d8bf50..f8d0b5e8bdfd 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -131,7 +131,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) { VM_BUG_ON_PAGE(page != compound_head(page), page); VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_add(nr, &page->_count); + page_ref_add(page, nr); SetPageReferenced(page); } -- cgit From 4cc7ecb7f2a60e8deb783b8fbf7c1ae467acb920 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:23:00 -0700 Subject: param: convert some "on"/"off" users to strtobool This changes several users of manual "on"/"off" parsing to use strtobool. Some side-effects: - these uses will now parse y/n/1/0 meaningfully too - the early_param uses will now bubble up parse errors Signed-off-by: Kees Cook Acked-by: Heiko Carstens Acked-by: Michael Ellerman Cc: Amitkumar Karwar Cc: Andy Shevchenko Cc: Daniel Borkmann Cc: Joe Perches Cc: Kalle Valo Cc: Martin Schwidefsky Cc: Nishant Sarmukadam Cc: Rasmus Villemoes Cc: Steve French Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/aperture_64.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e85f713641d..0a2bb1f62e72 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) return 0; } -static int gart_fix_e820 __initdata = 1; +static bool gart_fix_e820 __initdata = true; static int __init parse_gart_mem(char *p) { - if (!p) - return -EINVAL; - - if (!strncmp(p, "off", 3)) - gart_fix_e820 = 0; - else if (!strncmp(p, "on", 2)) - gart_fix_e820 = 1; - - return 0; + return kstrtobool(p, &gart_fix_e820); } early_param("gart_fix_e820", parse_gart_mem); -- cgit From 7834c10313fb823e538f2772be78edcdeed2e6e3 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 14 Mar 2016 21:20:54 -0400 Subject: x86/apic: Fix suspicious RCU usage in smp_trace_call_function_interrupt() Since 4.4, I've been able to trigger this occasionally: =============================== [ INFO: suspicious RCU usage. ] 4.5.0-rc7-think+ #3 Not tainted Cc: Andi Kleen Link: http://lkml.kernel.org/r/20160315012054.GA17765@codemonkey.org.uk Signed-off-by: Thomas Gleixner ------------------------------- ./arch/x86/include/asm/msr-trace.h:47 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 1 RCU used illegally from extended quiescent state! 
no locks held by swapper/3/0. stack backtrace: CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.5.0-rc7-think+ #3 ffffffff92f821e0 1f3e5c340597d7fc ffff880468e07f10 ffffffff92560c2a ffff880462145280 0000000000000001 ffff880468e07f40 ffffffff921376a6 ffffffff93665ea0 0000cc7c876d28da 0000000000000005 ffffffff9383dd60 Call Trace: [] dump_stack+0x67/0x9d [] lockdep_rcu_suspicious+0xe6/0x100 [] do_trace_write_msr+0x127/0x1a0 [] native_apic_msr_eoi_write+0x23/0x30 [] smp_trace_call_function_interrupt+0x38/0x360 [] trace_call_function_interrupt+0x90/0xa0 [] ? cpuidle_enter_state+0x1b4/0x520 Move the entering_irq() call before ack_APIC_irq(), because entering_irq() tells the RCU subsystems to end the extended quiescent state, so that the following trace call in ack_APIC_irq() works correctly. Suggested-by: Andi Kleen Fixes: 4787c368a9bc "x86/tracing: Add irq_enter/exit() in smp_trace_reschedule_interrupt()" Signed-off-by: Dave Jones Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/include/asm/apic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 0899cfc8dfe8..98f25bbafac4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -643,8 +643,8 @@ static inline void entering_irq(void) static inline void entering_ack_irq(void) { - ack_APIC_irq(); entering_irq(); + ack_APIC_irq(); } static inline void ipi_entering_ack_irq(void) -- cgit From f508a5ba7a4570418df6cfd68fe663ffdef2be63 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2016 08:35:29 +0100 Subject: x86/tsc: Prevent NULL pointer deref in calibrate_delay_is_known() The topology_core_cpumask is used to find a neighbour cpu in calibrate_delay_is_known(). It might not be allocated at the first invocation of that function on the boot cpu, when CONFIG_CPUMASK_OFFSTACK is set. The mask is allocated later in native_smp_prepare_cpus. As a consequence the underlying find_next_bit() call dereferences a NULL pointer. Add a proper check to prevent this. Fixes: c25323c07345 "x86/tsc: Use topology functions" Reported-and-tested-by: Richard W.M. Jones Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Boyer Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603180843270.3978@nanos Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5e19d2587cc5..c9c4c7ce3eb2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1306,11 +1306,15 @@ void __init tsc_init(void) unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); + struct cpumask *mask = topology_core_cpumask(cpu); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (!mask) + return 0; + + sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; -- cgit From 551adc60573cb68e3d55cacca9ba1b7437313df7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Mar 2016 09:40:46 +0100 Subject: x86/irq: Cure live lock in fixup_irqs() Harry reported that he's able to trigger a system freeze with cpu hot unplug. The freeze turned out to be a live lock caused by recent changes in irq_force_complete_move(). 
When fixup_irqs() and from there irq_force_complete_move() is called on the dying cpu, then all other cpus are in stop machine and wait for the dying cpu to complete the teardown. If there is a move of an interrupt pending then irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain mask and waits for them to clear the mask. That's obviously impossible as those cpus are firmly stuck in stop machine with interrupts disabled. I should have known that, but I completely overlooked it, being concentrated on the locking issues around the vectors. And the existence of the call to __irq_complete_move() in the code, which actually sends the cleanup IPI, made it reasonable to wait for that cleanup to complete. That call was bogus even before the recent changes as it was just a pointless distraction. We have to look at two cases: 1) The move_in_progress flag of the interrupt is set This means the ioapic has been updated with the new vector, but it has not fired yet. In theory there is a race: set_ioapic(new_vector) <-- Interrupt is raised before update is effective, i.e. it's raised on the old vector. So if the target cpu cannot handle that interrupt before the old vector is cleaned up, we get a spurious interrupt and in the worst case the ioapic irq line becomes stale, but my experiments so far have only resulted in spurious interrupts. But in case of cpu hotplug this should be a non-issue because if the affinity update happens right before all cpus rendezvous in stop machine, there is no way that the interrupt can be blocked on the target cpu because all cpus loop first with interrupts enabled in stop machine, so the old vector is not yet cleaned up when the interrupt fires. So the only way to run into this issue is if the delivery of the interrupt on the apic/system bus would be delayed beyond the point where the target cpu disables interrupts in stop machine. I doubt that it can happen, but at least there is a theoretical chance. Virtualization might be able to expose this, but AFAICT the IOAPIC emulation is not as stupid as the real hardware. I've spent quite some time over the weekend trying to force that situation, though I was not able to trigger the delayed case. 2) The move_in_progress flag is not set and the old_domain cpu mask is not empty. That means that an interrupt was delivered after the change and the cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have responded to it yet. In both cases we can assume that the next interrupt will arrive on the new vector, so we can clean up the old vectors on the cpus in the old_domain cpu mask. 
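A condensed sketch of the cleanup that conclusion implies, using the names from the patch below:

	/*
	 * The next interrupt is guaranteed to arrive on the new vector,
	 * so the per-cpu entries still pointing at the old vector can be
	 * released directly instead of waiting for a cleanup IPI that the
	 * cpus stuck in stop machine can never service.
	 */
	for_each_cpu(cpu, data->old_domain)
		per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;

	cpumask_clear(data->old_domain);
	data->move_in_progress = 0;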
Fixes: 98229aa36caa "x86/irq: Plug vector cleanup race" Reported-by: Harry Junior Tested-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Joe Lawrence Cc: Borislav Petkov Cc: Ben Hutchings Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/kernel/apic/vector.c | 88 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index f564d2b2275f..b90e1053049b 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -141,6 +141,7 @@ struct irq_alloc_info { struct irq_cfg { unsigned int dest_apicid; u8 vector; + u8 old_vector; }; extern struct irq_cfg *irq_cfg(unsigned int irq); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3b670df4ba7b..ad59d70bcb1a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -213,6 +213,7 @@ update: */ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); d->move_in_progress = !cpumask_empty(d->old_domain); + d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); success: @@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg) } /* - * Called with @desc->lock held and interrupts disabled. + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. */ void irq_force_complete_move(struct irq_desc *desc) { struct irq_data *irqdata = irq_desc_get_irq_data(desc); struct apic_chip_data *data = apic_chip_data(irqdata); struct irq_cfg *cfg = data ? &data->cfg : NULL; + unsigned int cpu; if (!cfg) return; - __irq_complete_move(cfg, cfg->vector); - /* * This is tricky. If the cleanup of @data->old_domain has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * - * The cleanup cannot make progress because we hold @desc->lock. So in - * case @data->old_domain is not yet cleaned up, we need to drop the - * lock and acquire it again. @desc cannot go away, because the - * hotplug code holds the sparse irq lock. + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. */ raw_spin_lock(&vector_lock); - /* Clean out all offline cpus (including ourself) first. */ + /* + * Clean out all offline cpus (including the outgoing one) from the + * old_domain mask. + */ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - while (!cpumask_empty(data->old_domain)) { + + /* + * If move_in_progress is cleared and the old_domain mask is empty, + * then there is nothing to cleanup. fixup_irqs() will take care of + * the stale vectors on the outgoing cpu. + */ + if (!data->move_in_progress && cpumask_empty(data->old_domain)) { raw_spin_unlock(&vector_lock); - raw_spin_unlock(&desc->lock); - cpu_relax(); - raw_spin_lock(&desc->lock); + return; + } + + /* + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (data->move_in_progress) { /* - * Reevaluate apic_chip_data. It might have been cleared after - * we dropped @desc->lock. 
+ * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendevouz in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theroretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. */ - data = apic_chip_data(irqdata); - if (!data) - return; - raw_spin_lock(&vector_lock); + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqdata->irq, cfg->old_vector); } + /* + * If old_domain is not empty, then other cpus still have the irq + * descriptor set in their vector array. Clean it up. + */ + for_each_cpu(cpu, data->old_domain) + per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; + + /* Cleanup the left overs of the (half finished) move */ + cpumask_clear(data->old_domain); + data->move_in_progress = 0; raw_spin_unlock(&vector_lock); } #endif -- cgit From 9d2099ab054558af0b3d4860b68a11aff420aa40 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Mon, 28 Dec 2015 16:35:07 +0800 Subject: x86: ftrace: Fix the misleading comment for arch/x86/kernel/ftrace.c Fix the misleading comment for arch/x86/kernel/ftrace.c that it had used nop instead of jmp. Signed-off-by: Li Bin Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6d6626..1b7d7e4fd7b0 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1,5 +1,5 @@ /* - * Code for replacing ftrace calls with jumps. + * Dynamic function tracing support. * * Copyright (C) 2007-2008 Steven Rostedt * -- cgit From b5d5f27d938fb6fc8d3202704e699d2694a02da6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Mar 2016 16:03:46 +0100 Subject: x86/topology: Fix logical package mapping That first branch testing pkg against __max_logical_packages is wrong, because if the first pkg id is larger, then the find_first_zero will find us logical package id 0. However, if the second pkg id is indeed 0, we'll again claim it without testing if it was already taken. Also, it fails to print the mapping. 
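A sketch of the allocation rule the fix enforces (error handling and the printk format here are illustrative, not verbatim from the patch): every physical package id, including 0, goes through the bitmap allocator, so two physical ids can never share a logical id, and the resulting mapping is reported.

	new = find_first_zero_bit(logical_package_map, __max_logical_packages);
	if (new >= __max_logical_packages)
		return -ENOSPC;			/* placeholder error handling */
	set_bit(new, logical_package_map);
	physical_to_logical_pkg[pkg] = new;
	/* ... and print the mapping instead of staying silent */
	pr_info("APIC(%x) physical package %u -> logical package %u\n",
		apicid, pkg, new);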
Fixes: 1f12e32f4cd5 ("x86/topology: Create logical package id") Reported-by: Xiong Zhou Signed-off-by: Peter Zijlstra (Intel) Cc: aherrmann@suse.com Cc: bp@alien8.de Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20160317095220.GO6344@twins.programming.kicks-ass.net Link: http://lkml.kernel.org/r/20160318150538.482393396@infradead.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 643dbdccf4bc..64b669dcbf23 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -274,11 +274,6 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - if (pkg < __max_logical_packages) { - set_bit(pkg, logical_package_map); - physical_to_logical_pkg[pkg] = pkg; - goto found; - } new = find_first_zero_bit(logical_package_map, __max_logical_packages); if (new >= __max_logical_packages) { physical_to_logical_pkg[pkg] = -1; -- cgit From 63d1e995be455ae9196270eb4b789de21afd42ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Mar 2016 16:03:48 +0100 Subject: x86/topology: Fix Intel HT disable As per the comment in the code; due to BIOS it is sometimes impossible to know if there actually are smp siblings until the machine is fully enumerated. So we rather overestimate the number of possible packages. Fixes: 1f12e32f4cd5 ("x86/topology: Create logical package id") Signed-off-by: Peter Zijlstra (Intel) Cc: aherrmann@suse.com Cc: jencce.kernel@gmail.com Cc: bp@alien8.de Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20160318150538.611014173@infradead.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 64b669dcbf23..81e6a432f23c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,8 +312,21 @@ static void __init smp_init_package_map(void) /* * Today neither Intel nor AMD support heterogenous systems. That * might change in the future.... + * + * While ideally we'd want '* smp_num_siblings' in the below @ncpus + * computation, this won't actually work since some Intel BIOSes + * report inconsistent HT data when they disable HT. + * + * In particular, they reduce the APIC-IDs to only include the cores, + * but leave the CPUID topology to say there are (2) siblings. + * This means we don't know how many threads there will be until + * after the APIC enumeration. + * + * By not including this we'll sometimes over-estimate the number of + * logical packages by the amount of !present siblings, but this is + * still better than MAX_LOCAL_APIC. */ - ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings; + ncpus = boot_cpu_data.x86_max_cores; __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); /* -- cgit From 3e8db2246b434c6b18a6a9f09904038bddcf76c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Mar 2016 17:20:30 +0100 Subject: x86/topology: Use total_cpus not nr_cpu_ids for logical packages nr_cpu_ids can be limited on the command line via nr_cpus=. That can break the logical package management because it results in a smaller number of packages, but the cpus to online are occupying the full package space as the hyper threads are enumerated after the physical cores typically. 
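A hypothetical configuration makes the failure mode concrete (the numbers are made up for illustration):

	/*
	 * 2 packages, 4 cores / 8 threads each: total_cpus = 16 and
	 * ncpus = 4 (cores only, see the previous patch).  Booting with
	 * nr_cpus=4 gives nr_cpu_ids = 4, so
	 *
	 *	DIV_ROUND_UP(nr_cpu_ids, ncpus) = DIV_ROUND_UP(4, 4) = 1
	 *
	 * logical package, yet firmware enumeration may place one of
	 * those four cpus in the second physical package, which then has
	 * no logical id left.  Sizing from total_cpus instead gives
	 * DIV_ROUND_UP(16, 4) = 4: an over-estimate, but never too small.
	 */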
total_cpus is the real possible cpu space not limited by nr_cpus command line and gives us the proper number of packages. Reported-by: Mike Galbraith Fixes: 1f12e32f4cd5 ("x86/topology: Create logical package id") Signed-off-by: Thomas Gleixner Cc: Xiong Zhou Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andreas Herrmann Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603181254330.3978@nanos --- arch/x86/kernel/smpboot.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 81e6a432f23c..b2c99f811c3f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -325,9 +325,14 @@ static void __init smp_init_package_map(void) * By not including this we'll sometimes over-estimate the number of * logical packages by the amount of !present siblings, but this is * still better than MAX_LOCAL_APIC. + * + * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited + * on the command line leading to a similar issue as the HT disable + * problem because the hyperthreads are usually enumerated after the + * primary cores. */ ncpus = boot_cpu_data.x86_max_cores; - __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); /* * Possibly larger than what we need as the number of apic ids per -- cgit From a38f98735e168a20573c24dfffa96095b6fe1d23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Mar 2016 11:41:42 +0100 Subject: x86/apb/timer: Use proper mask to modify hotplug action Magic hex constants are a guarantee for wreckage when the defines change. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apb_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 222a57076039..cefacbad1531 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -221,7 +221,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, unsigned long cpu = (unsigned long)hcpu; struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - switch (action & 0xf) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DEAD: dw_apb_clockevent_pause(adev->timer); if (system_state == SYSTEM_RUNNING) { -- cgit From f47ab81aca8264adae25005f0130db60c0af7797 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Mar 2016 11:47:39 +0100 Subject: x86/apic/uv: Fix the hotplug notifier The notifier is missing the CPU_DOWN_FAILED transition. That leaves the heartbeat disabled when CPU_DOWN_PREPARE fails. It also does not handle the FROZEN transition variants. That might not be an issue for UV, but it's inconsistent. 
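The pattern these hotplug-notifier fixups converge on, sketched with placeholder callbacks: mask out the FROZEN (suspend/resume) bit so both variants take the same path, and treat a failed offline as a fresh online so the state torn down in CPU_DOWN_PREPARE is restored.

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_FAILED:		/* offline aborted: re-enable */
	case CPU_ONLINE:
		example_enable(cpu);	/* placeholder */
		break;
	case CPU_DOWN_PREPARE:
		example_disable(cpu);	/* placeholder */
		break;
	}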
Signed-off-by: Thomas Gleixner Cc: Dimitri Sivanich --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 278070381aee..8f4942e2bcbb 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -792,7 +792,8 @@ static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: case CPU_ONLINE: uv_heartbeat_enable(cpu); break; -- cgit From f80be5e3d5aaea26c7351f4f08ab67fc4325a4ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Mar 2016 12:17:24 +0100 Subject: x86/hpet: Use proper mask to modify hotplug action Magic hex constants are a guarantee for wreckage when the defines change. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index be0ebbb6d1d1..a1f0e4a5c47e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -717,7 +717,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, struct hpet_work_struct work; struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); - switch (action & 0xf) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); -- cgit From 57d335ce88d055eb212e2531dd7a8b4240404a57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Mar 2016 12:28:38 +0100 Subject: x86/oprofile/nmi: Add missing hotplug FROZEN handling We really do not want to keep that nmi enabled across suspend/resume. Signed-off-by: Thomas Gleixner --- arch/x86/oprofile/nmi_int.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1d2e6392f5fa..0e07e0968c3a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -437,7 +437,8 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) { int cpu = (unsigned long)data; - switch (action) { + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_FAILED: case CPU_ONLINE: smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); -- cgit From 142b9e6c9de0fd7c0dff9a1d4a25390de46abf5e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Mar 2016 10:04:37 +0100 Subject: x86/kallsyms: fix GOLD link failure with new relative kallsyms table format Commit 2213e9a66bb8 ("kallsyms: add support for relative offsets in kallsyms address table") changed the default kallsyms symbol table format to use relative references rather than absolute addresses. This reduces the size of the kallsyms symbol table by 50% on 64-bit architectures, and further reduces the size of the relocation tables used by relocatable kernels. Since the memory footprint of the static kernel image is always much smaller than 4 GB, these relative references are assumed to be representable in 32 bits, even when the native word size is 64 bits. On 64-bit architectures, this obviously only works if the distance between each relative reference and the chosen anchor point is representable in 32 bits, and so the table generation code in scripts/kallsyms.c scans the table for the lowest value that is covered by the kernel text, and selects it as the anchor point. 
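A minimal sketch of how such a relative entry is resolved (identifier names are illustrative, not the exact kallsyms ones): every table entry is a 32-bit offset from a single anchor, which is why the anchor has to lie within 32-bit reach of every symbol in the kernel image.

	extern const unsigned long sketch_relative_base;
	extern const u32 sketch_offsets[];

	static unsigned long sketch_sym_address(unsigned int idx)
	{
		/* each entry is a 32-bit offset from the chosen anchor */
		return sketch_relative_base + sketch_offsets[idx];
	}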
However, when using the GOLD linker rather than the default BFD linker to build the x86_64 kernel, the symbol phys_offset_64, which is the result of arithmetic defined in the linker script, is emitted as a 'T' rather than an 'A' type symbol, resulting in scripts/kallsyms.c to mistake it for a suitable anchor point, even though it is far away from the actual kernel image in the virtual address space. This results in out-of-range warnings from scripts/kallsyms.c and a broken build. So let's align with the BFD linker, and emit the phys_offset_[32|64] symbols as absolute symbols explicitly. Note that the out of range issue does not exist on 32-bit x86, but this patch changes both symbols for symmetry. Reported-by: Markus Trippelsdorf Signed-off-by: Ard Biesheuvel Cc: Andrew Morton Cc: Kees Cook Cc: Guenter Roeck Signed-off-by: Linus Torvalds --- arch/x86/kernel/vmlinux.lds.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5af9958cbdb6..6bb070e54fda 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -81,11 +81,11 @@ PHDRS { SECTIONS { #ifdef CONFIG_X86_32 - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); #else - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; + . = __START_KERNEL; + phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); #endif /* Text and read-only data */ -- cgit From 0158b83f7508851cda9e62c4b14678c5bd492cb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Mar 2016 15:23:46 +0100 Subject: perf/x86/ibs: Fix IBS throttle When the IBS IRQ handler get a !0 return from perf_event_overflow; meaning it should throttle the event, it only disables it, it doesn't call perf_ibs_stop(). This confuses the state machine, as we'll use pmu::start() -> perf_ibs_start() to unthrottle. Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vince Weaver Cc: dvyukov@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/20160311142346.GE6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 51087c29b2c2..7956d29762ef 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -599,7 +599,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) - perf_ibs_disable_event(perf_ibs, hwc, *config); + perf_ibs_stop(event, 0); else perf_ibs_enable_event(perf_ibs, hwc, period >> 4); -- cgit From 5a50f52917011365a4a2c0f1d2cea0990c4465cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Mar 2016 23:55:21 +0100 Subject: perf/x86/ibs: Fix race with IBS_STARTING state While tracing the IBS bits I saw the NMI hitting between clearing IBS_STARTING and the actual MSR writes to disable the counter. 
Since IBS_STARTING was cleared, the handler assumed these were spurious NMIs and because STOPPING wasn't set yet either, insta-triggered an "Unknown NMI". Cure this by clearing IBS_STARTING after disabling the hardware. Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 7956d29762ef..c43a36d5fd6f 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -376,7 +376,13 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + /* + * Set STARTED before enabling the hardware, such that + * a subsequent NMI must observe it. Then clear STOPPING + * such that we don't consume NMIs by accident. + */ set_bit(IBS_STARTED, pcpu->state); + clear_bit(IBS_STOPPING, pcpu->state); perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); @@ -390,7 +396,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) u64 config; int stopping; - stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + stopping = test_bit(IBS_STARTED, pcpu->state); if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return; @@ -398,8 +404,24 @@ static void perf_ibs_stop(struct perf_event *event, int flags) rdmsrl(hwc->config_base, config); if (stopping) { + /* + * Set STOPPING before disabling the hardware, such that it + * must be visible to NMIs the moment we clear the EN bit, + * at which point we can generate an !VALID sample which + * we need to consume. + */ set_bit(IBS_STOPPING, pcpu->state); perf_ibs_disable_event(perf_ibs, hwc, config); + /* + * Clear STARTED after disabling the hardware; if it were + * cleared before an NMI hitting after the clear but before + * clearing the EN bit might think it a spurious NMI and not + * handle it. + * + * Clearing it after, however, creates the problem of the NMI + * handler seeing STARTED but not having a valid sample. + */ + clear_bit(IBS_STARTED, pcpu->state); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -527,20 +549,24 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { +fail: /* * Catch spurious interrupts after stopping IBS: After * disabling IBS there could be still incoming NMIs * with samples that even have the valid bit cleared. * Mark all this NMIs as handled. */ - return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 
1 : 0; + if (test_and_clear_bit(IBS_STOPPING, pcpu->state)) + return 1; + + return 0; } msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); if (!(*buf++ & perf_ibs->valid_mask)) - return 0; + goto fail; config = &ibs_data.regs[0]; perf_ibs_event_update(perf_ibs, event, config); -- cgit From c2872d381f1ab1bc39c7f044644bf1d3da7c11dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Mar 2016 15:06:58 +0100 Subject: perf/x86/ibs: Add IBS interrupt to the dynamic throttle Interrupt throttling is normally only done against sysctl_perf_event_sample_rate. This means that if that number is too high (for whatever reason) you can lock up your machine. We have, however, a dynamic throttling scheme too, but for that to work, we need to add a callback to the interrupt handler, IBS did not have this, so add it. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c43a36d5fd6f..3ea25c3917c0 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -637,6 +637,7 @@ out: static int perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + u64 stamp = sched_clock(); int handled = 0; handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); @@ -645,6 +646,8 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) if (handled) inc_irq_stat(apic_perf_irqs); + perf_sample_event_took(sched_clock() - stamp); + return handled; } NOKPROBE_SYMBOL(perf_ibs_nmi_handler); -- cgit From e8d8a90fc5014e4cdcf1de2f320081d26c7f440f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Mar 2016 17:31:27 +0100 Subject: perf/x86/BTS: Fix RCU usage This splat reminds us: [ 8166.045595] [ INFO: suspicious RCU usage. ] [ 8166.168972] [] lockdep_rcu_suspicious+0xe7/0x120 [ 8166.175966] [] perf_callchain+0x23e/0x250 [ 8166.182280] [] perf_prepare_sample+0x27d/0x350 [ 8166.189082] [] intel_pmu_drain_bts_buffer+0x133/0x200 ... that as the core code does, one should hold rcu_read_lock() over that entire BTS event-output generation sequence as well. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ce7211a07c0b..8584b90d8e0b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -570,11 +570,12 @@ int intel_pmu_drain_bts_buffer(void) * We will overwrite the from and to address before we output * the sample. */ + rcu_read_lock(); perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - base - skip))) - return 1; + goto unlock; for (at = base; at < top; at++) { /* Filter out any records that contain kernel addresses. 
*/ @@ -593,6 +594,8 @@ int intel_pmu_drain_bts_buffer(void) /* There's new data available. */ event->hw.interrupts++; event->pending_kill = POLL_IN; +unlock: + rcu_read_unlock(); return 1; } -- cgit From a223c1c7ab4cc64537dc4b911f760d851683768a Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:07 -0800 Subject: perf/x86/cqm: Fix CQM handling of grouping events into a cache_group Currently CQM (cache quality of service monitoring) is grouping all events belonging to same PID to use one RMID. However its not counting all of these different events. Hence we end up with a count of zero for all events other than the group leader. The patch tries to address the issue by keeping a flag in the perf_event.hw which has other CQM related fields. The field is updated at event creation and during grouping. Signed-off-by: Vikas Shivappa [peterz: Changed hw_perf_event::is_group_event to an int] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-2-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 93cb412a5579..b0226f1912a2 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -281,9 +281,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. + * Mark it as a multi event group, so that we update ->count + * for every event rather than just the group leader later. */ - if (a->hw.target == b->hw.target) + if (a->hw.target == b->hw.target) { + b->hw.is_group_event = true; return true; + } /* * Are we an inherited event? @@ -849,6 +853,7 @@ static void intel_cqm_setup_event(struct perf_event *event, bool conflict = false; u32 rmid; + event->hw.is_group_event = false; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -940,7 +945,9 @@ static u64 intel_cqm_event_count(struct perf_event *event) return __perf_event_count(event); /* - * Only the group leader gets to report values. This stops us + * Only the group leader gets to report values except in case of + * multiple events in the same group, we still need to read the + * other events.This stops us * reporting duplicate values to userspace, and gives us a clear * rule for which task gets to report the values. * @@ -948,7 +955,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) * specific packages - we forfeit that ability when we create * task events. 
*/ - if (!cqm_group_leader(event)) + if (!cqm_group_leader(event) && !event->hw.is_group_event) return 0; /* -- cgit From ada2f634cd50d050269b67b4e2966582387e7c27 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:08 -0800 Subject: perf/x86/cqm: Fix CQM memory leak and notifier leak Fixes the hotcpu notifier leak and other global variable memory leaks during CQM (cache quality of service monitoring) initialization. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-3-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index b0226f1912a2..dbb058d2910a 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -211,6 +211,20 @@ static void __put_rmid(u32 rmid) list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } +static void cqm_cleanup(void) +{ + int i; + + if (!cqm_rmid_ptrs) + return; + + for (i = 0; i < cqm_max_rmid; i++) + kfree(cqm_rmid_ptrs[i]); + + kfree(cqm_rmid_ptrs); + cqm_rmid_ptrs = NULL; +} + static int intel_cqm_setup_rmid_cache(void) { struct cqm_rmid_entry *entry; @@ -218,7 +232,7 @@ static int intel_cqm_setup_rmid_cache(void) int r = 0; nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * nr_rmids, GFP_KERNEL); if (!cqm_rmid_ptrs) return -ENOMEM; @@ -249,11 +263,9 @@ static int intel_cqm_setup_rmid_cache(void) mutex_unlock(&cache_mutex); return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - kfree(cqm_rmid_ptrs); +fail: + cqm_cleanup(); return -ENOMEM; } @@ -1312,7 +1324,7 @@ static const struct x86_cpu_id intel_cqm_match[] = { static int __init intel_cqm_init(void) { - char *str, scale[20]; + char *str = NULL, scale[20]; int i, cpu, ret; if (!x86_match_cpu(intel_cqm_match)) @@ -1372,16 +1384,25 @@ static int __init intel_cqm_init(void) cqm_pick_event_reader(i); } - __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) + if (ret) { pr_err("Intel CQM perf registration failed: %d\n", ret); - else - pr_info("Intel CQM monitoring enabled\n"); + goto out; + } + pr_info("Intel CQM monitoring enabled\n"); + + /* + * Register the hot cpu notifier once we are sure cqm + * is enabled to avoid notifier leak. + */ + __perf_cpu_notifier(intel_cqm_cpu_notifier); out: cpu_notifier_register_done(); + if (ret) { + kfree(str); + cqm_cleanup(); + } return ret; } -- cgit From 33c3cc7acfd95968d74247f1a4e1b0727a07ed43 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:09 -0800 Subject: perf/x86/mbm: Add Intel Memory B/W Monitoring enumeration and init The MBM init patch enumerates the Intel MBM (Memory b/w monitoring) and initializes the perf events and datastructures for monitoring the memory b/w. 
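A condensed sketch of the enumeration step, with the match tables and flags as defined in the patch below: CQM and MBM are detected independently from their CPUID feature bits, and the driver bails out only when neither capability is present.

	if (x86_match_cpu(intel_cqm_match))
		cqm_enabled = true;

	if (x86_match_cpu(intel_mbm_local_match) &&
	    x86_match_cpu(intel_mbm_total_match))
		mbm_enabled = true;

	/* neither CQM nor MBM enumerated: nothing to register */
	if (!cqm_enabled && !mbm_enabled)
		return -ENODEV;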
It's based on the original patch series by Tony Luck and Kanaka Juvva. Memory bandwidth monitoring (MBM) provides the OS/VMM a way to monitor bandwidth from one level of cache to another. The current patches support L3 external bandwidth monitoring. It supports both 'local bandwidth' and 'total bandwidth' monitoring for the socket. Local bandwidth measures the amount of data sent through the memory controller on the socket and total b/w measures the total system bandwidth. Extending the cache quality of service monitoring (CQM) we add two more events to the perf infrastructure: intel_cqm_llc/local_bytes - bytes sent through local socket memory controller intel_cqm_llc/total_bytes - total L3 external bytes sent The tasks are associated with a Resource Monitoring ID (RMID) just like in CQM and the OS uses an MSR write to indicate the RMID of the task during scheduling. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-4-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 126 ++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/kernel/cpu/common.c | 4 +- 3 files changed, 128 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index dbb058d2910a..515df11e65bb 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -15,6 +15,7 @@ static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ +static bool cqm_enabled, mbm_enabled; /** * struct intel_pqr_state - State cache for the PQR MSR @@ -42,6 +43,24 @@ struct intel_pqr_state { * interrupts disabled, which is sufficient for the protection. */ static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +/** + * struct sample - mbm event's (local or total) data + * @total_bytes #bytes since we began monitoring + * @prev_msr previous value of MSR + */ +struct sample { + u64 total_bytes; + u64 prev_msr; +}; + +/* + * samples profiled for total memory bandwidth type events + */ +static struct sample *mbm_total; +/* + * samples profiled for local memory bandwidth type events + */ +static struct sample *mbm_local; /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. 
@@ -223,6 +242,7 @@ static void cqm_cleanup(void) kfree(cqm_rmid_ptrs); cqm_rmid_ptrs = NULL; + cqm_enabled = false; } static int intel_cqm_setup_rmid_cache(void) @@ -1164,6 +1184,16 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); +EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); +EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); +EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); +EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); + +EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); +EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); +EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); +EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); + static struct attribute *intel_cqm_events_attr[] = { EVENT_PTR(intel_cqm_llc), EVENT_PTR(intel_cqm_llc_pkg), @@ -1173,9 +1203,38 @@ static struct attribute *intel_cqm_events_attr[] = { NULL, }; +static struct attribute *intel_mbm_events_attr[] = { + EVENT_PTR(intel_cqm_total_bytes), + EVENT_PTR(intel_cqm_local_bytes), + EVENT_PTR(intel_cqm_total_bytes_pkg), + EVENT_PTR(intel_cqm_local_bytes_pkg), + EVENT_PTR(intel_cqm_total_bytes_unit), + EVENT_PTR(intel_cqm_local_bytes_unit), + EVENT_PTR(intel_cqm_total_bytes_scale), + EVENT_PTR(intel_cqm_local_bytes_scale), + NULL, +}; + +static struct attribute *intel_cmt_mbm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_total_bytes), + EVENT_PTR(intel_cqm_local_bytes), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_total_bytes_pkg), + EVENT_PTR(intel_cqm_local_bytes_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_total_bytes_unit), + EVENT_PTR(intel_cqm_local_bytes_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_total_bytes_scale), + EVENT_PTR(intel_cqm_local_bytes_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + static struct attribute_group intel_cqm_events_group = { .name = "events", - .attrs = intel_cqm_events_attr, + .attrs = NULL, }; PMU_FORMAT_ATTR(event, "config:0-7"); @@ -1322,12 +1381,57 @@ static const struct x86_cpu_id intel_cqm_match[] = { {} }; +static void mbm_cleanup(void) +{ + if (!mbm_enabled) + return; + + kfree(mbm_local); + kfree(mbm_total); + mbm_enabled = false; +} + +static const struct x86_cpu_id intel_mbm_local_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, + {} +}; + +static const struct x86_cpu_id intel_mbm_total_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, + {} +}; + +static int intel_mbm_init(void) +{ + int array_size, maxid = cqm_max_rmid + 1; + + array_size = sizeof(struct sample) * maxid * topology_max_packages(); + mbm_local = kmalloc(array_size, GFP_KERNEL); + if (!mbm_local) + return -ENOMEM; + + mbm_total = kmalloc(array_size, GFP_KERNEL); + if (!mbm_total) { + mbm_cleanup(); + return -ENOMEM; + } + + return 0; +} + static int __init intel_cqm_init(void) { char *str = NULL, scale[20]; int i, cpu, ret; - if (!x86_match_cpu(intel_cqm_match)) + if (x86_match_cpu(intel_cqm_match)) + cqm_enabled = true; + + if (x86_match_cpu(intel_mbm_local_match) && + x86_match_cpu(intel_mbm_total_match)) + mbm_enabled = true; + + if (!cqm_enabled && !mbm_enabled) return -ENODEV; cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; @@ -1384,13 +1488,28 @@ static int 
__init intel_cqm_init(void) cqm_pick_event_reader(i); } + if (mbm_enabled) + ret = intel_mbm_init(); + if (ret && !cqm_enabled) + goto out; + + if (cqm_enabled && mbm_enabled) + intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; + else if (!cqm_enabled && mbm_enabled) + intel_cqm_events_group.attrs = intel_mbm_events_attr; + else if (cqm_enabled && !mbm_enabled) + intel_cqm_events_group.attrs = intel_cqm_events_attr; + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) { pr_err("Intel CQM perf registration failed: %d\n", ret); goto out; } - pr_info("Intel CQM monitoring enabled\n"); + if (cqm_enabled) + pr_info("Intel CQM monitoring enabled\n"); + if (mbm_enabled) + pr_info("Intel MBM enabled\n"); /* * Register the hot cpu notifier once we are sure cqm @@ -1402,6 +1521,7 @@ out: if (ret) { kfree(str); cqm_cleanup(); + mbm_cleanup(); } return ret; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 074b7604bd51..746dd6ae4932 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -245,6 +245,8 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62590aa064c8..e601c1286e29 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -649,7 +649,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_F_1_EDX] = edx; - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || + ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || + (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; } -- cgit From 87f01cc2a2914b61ade5ec834377fa7819484173 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 11 Mar 2016 11:26:11 -0800 Subject: perf/x86/mbm: Add memory bandwidth monitoring event management Includes all the core infrastructure to measure the total_bytes and bandwidth. We have per socket counters for both total system wide L3 external bytes and local socket memory-controller bytes. The OS does MSR writes to MSR_IA32_QM_EVTSEL and MSR_IA32_QM_CTR to read the counters and uses the IA32_PQR_ASSOC_MSR to associate the RMID with the task. The tasks have a common RMID for CQM (cache quality of service monitoring) and MBM. Hence most of the scheduling code is reused from CQM. Signed-off-by: Tony Luck [ Restructured rmid_read to not have an obvious hole, removed MBM_CNTR_MAX as its unused. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vikas Shivappa Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/abd7aac9a18d93b95b985b931cf258df0164746d.1457723885.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 122 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 515df11e65bb..610bd8ab37e4 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -13,6 +13,8 @@ #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d +#define MBM_CNTR_WIDTH 24 + static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; @@ -62,6 +64,16 @@ static struct sample *mbm_total; */ static struct sample *mbm_local; +#define pkg_id topology_physical_package_id(smp_processor_id()) +/* + * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. + * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of + * rmids per socket, an example is given below + * RMID1 of Socket0: vrmid = 1 + * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 + * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 + */ +#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. * Also protects event->hw.cqm_rmid @@ -84,9 +96,13 @@ static cpumask_t cqm_cpumask; #define RMID_VAL_ERROR (1ULL << 63) #define RMID_VAL_UNAVAIL (1ULL << 62) -#define QOS_L3_OCCUP_EVENT_ID (1 << 0) - -#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_MBM_LOCAL_EVENT_ID 0x03 /* * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). @@ -428,10 +444,17 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) struct rmid_read { u32 rmid; + u32 evt_type; atomic64_t value; }; static void __intel_cqm_event_count(void *info); +static void init_mbm_sample(u32 rmid, u32 evt_type); + +static bool is_mbm_event(int e) +{ + return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); +} /* * Exchange the RMID of a group of events. 
@@ -873,6 +896,68 @@ static void intel_cqm_rmid_rotate(struct work_struct *work) schedule_delayed_work(&intel_cqm_rmid_work, delay); } +static u64 update_sample(unsigned int rmid, u32 evt_type, int first) +{ + struct sample *mbm_current; + u32 vrmid = rmid_2_index(rmid); + u64 val, bytes, shift; + u32 eventid; + + if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { + mbm_current = &mbm_local[vrmid]; + eventid = QOS_MBM_LOCAL_EVENT_ID; + } else { + mbm_current = &mbm_total[vrmid]; + eventid = QOS_MBM_TOTAL_EVENT_ID; + } + + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return mbm_current->total_bytes; + + if (first) { + mbm_current->prev_msr = val; + mbm_current->total_bytes = 0; + return mbm_current->total_bytes; + } + + shift = 64 - MBM_CNTR_WIDTH; + bytes = (val << shift) - (mbm_current->prev_msr << shift); + bytes >>= shift; + + bytes *= cqm_l3_scale; + + mbm_current->total_bytes += bytes; + mbm_current->prev_msr = val; + + return mbm_current->total_bytes; +} + +static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) +{ + return update_sample(rmid, evt_type, 0); +} + +static void __intel_mbm_event_init(void *info) +{ + struct rmid_read *rr = info; + + update_sample(rr->rmid, rr->evt_type, 1); +} + +static void init_mbm_sample(u32 rmid, u32 evt_type) +{ + struct rmid_read rr = { + .rmid = rmid, + .evt_type = evt_type, + .value = ATOMIC64_INIT(0), + }; + + /* on each socket, init sample */ + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); +} + /* * Find a group and setup RMID. * @@ -893,6 +978,8 @@ static void intel_cqm_setup_event(struct perf_event *event, /* All tasks in a group share an RMID */ event->hw.cqm_rmid = rmid; *group = iter; + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); return; } @@ -909,6 +996,9 @@ static void intel_cqm_setup_event(struct perf_event *event, else rmid = __get_rmid(); + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + event->hw.cqm_rmid = rmid; } @@ -930,7 +1020,10 @@ static void intel_cqm_event_read(struct perf_event *event) if (!__rmid_valid(rmid)) goto out; - val = __rmid_read(rmid); + if (is_mbm_event(event->attr.config)) + val = rmid_read_mbm(rmid, event->attr.config); + else + val = __rmid_read(rmid); /* * Ignore this reading on error states and do not update the value. 
@@ -961,6 +1054,17 @@ static inline bool cqm_group_leader(struct perf_event *event) return !list_empty(&event->hw.cqm_groups_entry); } +static void __intel_mbm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = rmid_read_mbm(rr->rmid, rr->evt_type); + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + atomic64_add(val, &rr->value); +} + static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; @@ -1014,7 +1118,12 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!__rmid_valid(rr.rmid)) goto out; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + if (is_mbm_event(event->attr.config)) { + rr.evt_type = event->attr.config; + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1); + } else { + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + } raw_spin_lock_irqsave(&cache_lock, flags); if (event->hw.cqm_rmid == rr.rmid) @@ -1129,7 +1238,8 @@ static int intel_cqm_event_init(struct perf_event *event) if (event->attr.type != intel_cqm_pmu.type) return -ENOENT; - if (event->attr.config & ~QOS_EVENT_MASK) + if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || + (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; /* unsupported modes and filters */ -- cgit From 2d4de8376ff1d94a5070cfa9092c59bfdc4e693e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:11 -0800 Subject: perf/x86/mbm: Implement RMID recycling RMID could be allocated or deallocated as part of RMID recycling. When an RMID is allocated for MBM event, the MBM counter needs to be initialized because next time we read the counter we need the previous value to account for total bytes that went to the memory controller. Similarly, when RMID is deallocated we need to update the ->count variable. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-6-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 610bd8ab37e4..a98f472bf6b2 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -450,6 +450,7 @@ struct rmid_read { static void __intel_cqm_event_count(void *info); static void init_mbm_sample(u32 rmid, u32 evt_type); +static void __intel_mbm_event_count(void *info); static bool is_mbm_event(int e) { @@ -476,8 +477,14 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) .rmid = old_rmid, }; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); + if (is_mbm_event(group->attr.config)) { + rr.evt_type = group->attr.config; + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, + &rr, 1); + } else { + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, + &rr, 1); + } local64_set(&group->count, atomic64_read(&rr.value)); } @@ -489,6 +496,22 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) raw_spin_unlock_irq(&cache_lock); + /* + * If the allocation is for mbm, init the mbm stats. + * Need to check if each event in the group is mbm event + * because there could be multiple type of events in the same group. + */ + if (__rmid_valid(rmid)) { + event = group; + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + + list_for_each_entry(event, head, hw.cqm_group_entry) { + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + } + } + return old_rmid; } @@ -978,7 +1001,7 @@ static void intel_cqm_setup_event(struct perf_event *event, /* All tasks in a group share an RMID */ event->hw.cqm_rmid = rmid; *group = iter; - if (is_mbm_event(event->attr.config)) + if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) init_mbm_sample(rmid, event->attr.config); return; } @@ -996,7 +1019,7 @@ static void intel_cqm_setup_event(struct perf_event *event, else rmid = __get_rmid(); - if (is_mbm_event(event->attr.config)) + if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) init_mbm_sample(rmid, event->attr.config); event->hw.cqm_rmid = rmid; -- cgit From e7ee3e8cb550ce43752ae1d1b190d6b5c4150a43 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 11 Mar 2016 11:26:17 -0800 Subject: perf/x86/mbm: Add support for MBM counter overflow handling This patch adds a per package timer which periodically updates the memory bandwidth counters for the events that are currently active. Current patch has a periodic timer every 1s since the SDM guarantees that the counter will not overflow in 1s but this time can be definitely improved by calibrating on the system. The overflow is really a function of the max memory b/w that the socket can support, max counter value and scaling factor. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/013b756c5006b1c4ca411f3ecf43ed52f19fbf87.1457723885.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 139 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index a98f472bf6b2..380d62da8108 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -14,10 +14,15 @@ #define MSR_IA32_QM_EVTSEL 0x0c8d #define MBM_CNTR_WIDTH 24 +/* + * Guaranteed time in ms as per SDM where MBM counters will not overflow. + */ +#define MBM_CTR_OVERFLOW_TIME 1000 static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; +unsigned int mbm_socket_max; /** * struct intel_pqr_state - State cache for the PQR MSR @@ -45,6 +50,7 @@ struct intel_pqr_state { * interrupts disabled, which is sufficient for the protection. */ static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +static struct hrtimer *mbm_timers; /** * struct sample - mbm event's (local or total) data * @total_bytes #bytes since we began monitoring @@ -945,6 +951,10 @@ static u64 update_sample(unsigned int rmid, u32 evt_type, int first) return mbm_current->total_bytes; } + /* + * The h/w guarantees that counters will not overflow + * so long as we poll them at least once per second. + */ shift = 64 - MBM_CNTR_WIDTH; bytes = (val << shift) - (mbm_current->prev_msr << shift); bytes >>= shift; @@ -1088,6 +1098,84 @@ static void __intel_mbm_event_count(void *info) atomic64_add(val, &rr->value); } +static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct perf_event *iter, *iter1; + int ret = HRTIMER_RESTART; + struct list_head *head; + unsigned long flags; + u32 grp_rmid; + + /* + * Need to cache_lock as the timer Event Select MSR reads + * can race with the mbm/cqm count() and mbm_init() reads. 
+ */ + raw_spin_lock_irqsave(&cache_lock, flags); + + if (list_empty(&cache_groups)) { + ret = HRTIMER_NORESTART; + goto out; + } + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + grp_rmid = iter->hw.cqm_rmid; + if (!__rmid_valid(grp_rmid)) + continue; + if (is_mbm_event(iter->attr.config)) + update_sample(grp_rmid, iter->attr.config, 0); + + head = &iter->hw.cqm_group_entry; + if (list_empty(head)) + continue; + list_for_each_entry(iter1, head, hw.cqm_group_entry) { + if (!iter1->hw.is_group_event) + break; + if (is_mbm_event(iter1->attr.config)) + update_sample(iter1->hw.cqm_rmid, + iter1->attr.config, 0); + } + } + + hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); + + return ret; +} + +static void __mbm_start_timer(void *info) +{ + hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), + HRTIMER_MODE_REL_PINNED); +} + +static void __mbm_stop_timer(void *info) +{ + hrtimer_cancel(&mbm_timers[pkg_id]); +} + +static void mbm_start_timers(void) +{ + on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); +} + +static void mbm_stop_timers(void) +{ + on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); +} + +static void mbm_hrtimer_init(void) +{ + struct hrtimer *hr; + int i; + + for (i = 0; i < mbm_socket_max; i++) { + hr = &mbm_timers[i]; + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = mbm_hrtimer_handle; + } +} + static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; @@ -1217,8 +1305,14 @@ static int intel_cqm_event_add(struct perf_event *event, int mode) static void intel_cqm_event_destroy(struct perf_event *event) { struct perf_event *group_other = NULL; + unsigned long flags; mutex_lock(&cache_mutex); + /* + * Hold the cache_lock as mbm timer handlers could be + * scanning the list of events. + */ + raw_spin_lock_irqsave(&cache_lock, flags); /* * If there's another event in this group... @@ -1250,6 +1344,14 @@ static void intel_cqm_event_destroy(struct perf_event *event) } } + raw_spin_unlock_irqrestore(&cache_lock, flags); + + /* + * Stop the mbm overflow timers when the last event is destroyed. + */ + if (mbm_enabled && list_empty(&cache_groups)) + mbm_stop_timers(); + mutex_unlock(&cache_mutex); } @@ -1257,6 +1359,7 @@ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; bool rotate = false; + unsigned long flags; if (event->attr.type != intel_cqm_pmu.type) return -ENOENT; @@ -1282,9 +1385,21 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); + /* + * Start the mbm overflow timers when the first event is created. + */ + if (mbm_enabled && list_empty(&cache_groups)) + mbm_start_timers(); + /* Will also set rmid */ intel_cqm_setup_event(event, &group); + /* + * Hold the cache_lock as mbm timer handlers be + * scanning the list of events. 
+ */ + raw_spin_lock_irqsave(&cache_lock, flags); + if (group) { list_add_tail(&event->hw.cqm_group_entry, &group->hw.cqm_group_entry); @@ -1303,6 +1418,7 @@ static int intel_cqm_event_init(struct perf_event *event) rotate = true; } + raw_spin_unlock_irqrestore(&cache_lock, flags); mutex_unlock(&cache_mutex); if (rotate) @@ -1536,20 +1652,33 @@ static const struct x86_cpu_id intel_mbm_total_match[] = { static int intel_mbm_init(void) { - int array_size, maxid = cqm_max_rmid + 1; + int ret = 0, array_size, maxid = cqm_max_rmid + 1; - array_size = sizeof(struct sample) * maxid * topology_max_packages(); + mbm_socket_max = topology_max_packages(); + array_size = sizeof(struct sample) * maxid * mbm_socket_max; mbm_local = kmalloc(array_size, GFP_KERNEL); if (!mbm_local) return -ENOMEM; mbm_total = kmalloc(array_size, GFP_KERNEL); if (!mbm_total) { - mbm_cleanup(); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - return 0; + array_size = sizeof(struct hrtimer) * mbm_socket_max; + mbm_timers = kmalloc(array_size, GFP_KERNEL); + if (!mbm_timers) { + ret = -ENOMEM; + goto out; + } + mbm_hrtimer_init(); + +out: + if (ret) + mbm_cleanup(); + + return ret; } static int __init intel_cqm_init(void) -- cgit From 27348f382b6786fd201779246ee70fa115a5b890 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Mar 2016 23:39:39 +0100 Subject: perf/x86/cqm: Factor out some common code Having the same code twice (and once quite ugly) is fragile. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 380d62da8108..7b5fd811ef45 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -463,6 +463,14 @@ static bool is_mbm_event(int e) return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); } +static void cqm_mask_call(struct rmid_read *rr) +{ + if (is_mbm_event(rr->evt_type)) + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); + else + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); +} + /* * Exchange the RMID of a group of events. 
*/ @@ -479,18 +487,12 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) */ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { struct rmid_read rr = { - .value = ATOMIC64_INIT(0), .rmid = old_rmid, + .evt_type = group->attr.config, + .value = ATOMIC64_INIT(0), }; - if (is_mbm_event(group->attr.config)) { - rr.evt_type = group->attr.config; - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, - &rr, 1); - } else { - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); - } + cqm_mask_call(&rr); local64_set(&group->count, atomic64_read(&rr.value)); } @@ -1180,6 +1182,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; struct rmid_read rr = { + .evt_type = event->attr.config, .value = ATOMIC64_INIT(0), }; @@ -1229,12 +1232,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!__rmid_valid(rr.rmid)) goto out; - if (is_mbm_event(event->attr.config)) { - rr.evt_type = event->attr.config; - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1); - } else { - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); - } + cqm_mask_call(&rr); raw_spin_lock_irqsave(&cache_lock, flags); if (event->hw.cqm_rmid == rr.rmid) -- cgit From 8dfeae0d73bf803be1a533e147b3b0ea69375596 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 14 Jan 2016 10:50:04 +0800 Subject: perf/x86/amd: Move nodes_per_socket into bsp_init_amd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nodes_per_socket is static and it needn't be initialized many times during every CPU core init. So move its initialization into bsp_init_amd(). Signed-off-by: Huang Rui Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Alexander Shishkin Cc: Andreas Herrmann Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: Guenter Roeck Cc: H. 
Peter Anvin Cc: Hector Marco-Gisbert Cc: Jacob Shin Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Cc: spg_linux_kernel@amd.com Link: http://lkml.kernel.org/r/1452739808-11871-2-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b39338c4b260..d4b06e8cd77c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -306,7 +306,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - nodes_per_socket = ((ecx >> 8) & 7) + 1; node_id = ecx & 7; /* get compute unit information */ @@ -317,7 +316,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; node_id = value & 7; } else return; @@ -519,6 +517,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes_per_socket = ((value >> 3) & 7) + 1; + } } static void early_init_amd(struct cpuinfo_x86 *c) -- cgit From f8519155b4d5224e215203bf0e94a8478a8f8945 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Sun, 28 Feb 2016 22:23:29 -0600 Subject: perf/x86/amd: Add support for new IOMMU performance events This patch adds new IOMMU performance event based on the information in table 74 of the AMD I/O Virtualization Technology (IOMMU) Specification (Document Id: 4882, Rev 2.62, Feb 2015) Signed-off-by: Suravee Suthikulpanit Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joerg Roedel Acked-by: Joerg Roedel Cc: Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://support.amd.com/TechDocs/48882_IOMMU.pdf Signed-off-by: Ingo Molnar --- arch/x86/events/amd/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 635e5eba0caf..40625ca7a190 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -118,6 +118,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"), + AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"), + AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"), + AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"), + AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"), { /* end: all zeroes */ }, }; -- cgit From 01fe03ff1c7ce6b6e2212cb6171a49c2858fbc7c Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 14 Jan 2016 10:50:06 +0800 Subject: x86/cpufeature, perf/x86: Add AMD Accumulated Power Mechanism feature flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AMD CPU family 15h model 0x60 introduces a mechanism for measuring accumulated power. It is used to report the processor power consumption and support for it is indicated by CPUID Fn8000_0007_EDX[12]. Signed-off-by: Huang Rui Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Alexander Shishkin Cc: Andreas Herrmann Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: Guenter Roeck Cc: H. Peter Anvin Cc: Hector Marco-Gisbert Cc: Jacob Shin Cc: Jiri Olsa Cc: John Stultz Cc: Kristen Carlson Accardi Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Ross Zwisler Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wan Zongshun Cc: spg_linux_kernel@amd.com Link: http://lkml.kernel.org/r/1452739808-11871-4-git-send-email-ray.huang@amd.com [ Resolved conflict and moved the synthetic CPUID slot to 19. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 746dd6ae4932..44ebd04878eb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -94,7 +94,7 @@ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ #define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d4b06e8cd77c..68fe8d3bed56 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -546,6 +546,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_sched_clock_stable(); } + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else -- cgit From c7ab62bfbe0e27ef452d19d88b083f01e99f13a7 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 9 Mar 2016 13:45:06 +0800 Subject: perf/x86/amd/power: Add AMD accumulated power reporting mechanism Introduce an AMD accumlated power reporting mechanism for the Family 15h, Model 60h processor that can be used to calculate the average power consumed by a processor during a measurement interval. The feature support is indicated by CPUID Fn8000_0007_EDX[12]. This feature will be implemented both in hwmon and perf. The current design provides one event to report per package/processor power consumption by counting each compute unit power value. Here the gory details of how the computation is done: * Tsample: compute unit power accumulator sample period * Tref: the PTSC counter period (PTSC: performance timestamp counter) * N: the ratio of compute unit power accumulator sample period to the PTSC period * Jmax: max compute unit accumulated power which is indicated by MSR_C001007b[MaxCpuSwPwrAcc] * Jx/Jy: compute unit accumulated power which is indicated by MSR_C001007a[CpuSwPwrAcc] * Tx/Ty: the value of performance timestamp counter which is indicated by CU_PTSC MSR_C0010280[PTSC] * PwrCPUave: CPU average power i. Determine the ratio of Tsample to Tref by executing CPUID Fn8000_0007. N = value of CPUID Fn8000_0007_ECX[CpuPwrSampleTimeRatio[15:0]]. ii. Read the full range of the cumulative energy value from the new MSR MaxCpuSwPwrAcc. Jmax = value returned. iii. At time x, software reads CpuSwPwrAcc and samples the PTSC. Jx = value read from CpuSwPwrAcc and Tx = value read from PTSC. iv. At time y, software reads CpuSwPwrAcc and samples the PTSC. Jy = value read from CpuSwPwrAcc and Ty = value read from PTSC. v. Calculate the average power consumption for a compute unit over time period (y-x). 
Unit of result is uWatt: if (Jy < Jx) // Rollover has occurred Jdelta = (Jy + Jmax) - Jx else Jdelta = Jy - Jx PwrCPUave = N * Jdelta * 1000 / (Ty - Tx) Simple example: root@hr-zp:/home/ray/tip# ./tools/perf/perf stat -a -e 'power/power-pkg/' make -j4 CHK include/config/kernel.release CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h CHK include/generated/timeconst.h CHK include/generated/bounds.h CHK include/generated/asm-offsets.h CALL scripts/checksyscalls.sh CHK include/generated/compile.h SKIPPED include/generated/compile.h Building modules, stage 2. Kernel: arch/x86/boot/bzImage is ready (#40) MODPOST 4225 modules Performance counter stats for 'system wide': 183.44 mWatts power/power-pkg/ 341.837270111 seconds time elapsed root@hr-zp:/home/ray/tip# ./tools/perf/perf stat -a -e 'power/power-pkg/' sleep 10 Performance counter stats for 'system wide': 0.18 mWatts power/power-pkg/ 10.012551815 seconds time elapsed Suggested-by: Peter Zijlstra Suggested-by: Ingo Molnar Suggested-by: Borislav Petkov Signed-off-by: Huang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Namhyung Kim Cc: Robert Richter Cc: Stephane Eranian Cc: Vince Weaver Cc: jacob.w.shin@gmail.com Link: http://lkml.kernel.org/r/1457502306-2559-1-git-send-email-ray.huang@amd.com [ Fixed the modular build. ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 ++ arch/x86/events/Makefile | 1 + arch/x86/events/amd/power.c | 353 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/events/core.c | 4 +- 4 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 arch/x86/events/amd/power.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8f2e6659281b..a313c0e7e165 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1206,6 +1206,15 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index fdfea1511cc0..f59618a39990 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,7 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c new file mode 100644 index 000000000000..55a3529dbf12 --- /dev/null +++ b/arch/x86/events/amd/power.c @@ -0,0 +1,353 @@ +/* + * Performance events - AMD Processor Power Reporting Mechanism + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Huang Rui + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include "../perf_event.h" + +#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a +#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b +#define MSR_F15H_PTSC 0xc0010280 + +/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ +#define AMD_POWER_EVENT_MASK 0xFFULL + +/* + * Accumulated power status counters. + */ +#define AMD_POWER_EVENTSEL_PKG 1 + +/* + * The ratio of compute unit power accumulator sample period to the + * PTSC period. + */ +static unsigned int cpu_pwr_sample_ratio; + +/* Maximum accumulated power of a compute unit. */ +static u64 max_cu_acc_power; + +static struct pmu pmu_class; + +/* + * Accumulated power represents the sum of each compute unit's (CU) power + * consumption. On any core of each CU we read the total accumulated power from + * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores + * which are picked to measure the power for the CUs they belong to. + */ +static cpumask_t cpu_mask; + +static void event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc; + u64 delta, tdelta; + + prev_pwr_acc = hwc->pwr_acc; + prev_ptsc = hwc->ptsc; + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc); + rdmsrl(MSR_F15H_PTSC, new_ptsc); + + /* + * Calculate the CU power consumption over a time period, the unit of + * final value (delta) is micro-Watts. Then add it to the event count. + */ + if (new_pwr_acc < prev_pwr_acc) { + delta = max_cu_acc_power + new_pwr_acc; + delta -= prev_pwr_acc; + } else + delta = new_pwr_acc - prev_pwr_acc; + + delta *= cpu_pwr_sample_ratio * 1000; + tdelta = new_ptsc - prev_ptsc; + + do_div(delta, tdelta); + local64_add(delta, &event->count); +} + +static void __pmu_event_start(struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + rdmsrl(MSR_F15H_PTSC, event->hw.ptsc); + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc); +} + +static void pmu_event_start(struct perf_event *event, int mode) +{ + __pmu_event_start(event); +} + +static void pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + /* Mark event as deactivated and stopped. */ + if (!(hwc->state & PERF_HES_STOPPED)) + hwc->state |= PERF_HES_STOPPED; + + /* Check if software counter update is necessary. */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of an event + * that we are disabling: + */ + event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int pmu_event_add(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __pmu_event_start(event); + + return 0; +} + +static void pmu_event_del(struct perf_event *event, int flags) +{ + pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK; + + /* Only look at AMD power events. */ + if (event->attr.type != pmu_class.type) + return -ENOENT; + + /* Unsupported modes and filters. 
*/ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + /* no sampling */ + event->attr.sample_period) + return -EINVAL; + + if (cfg != AMD_POWER_EVENTSEL_PKG) + return -EINVAL; + + return 0; +} + +static void pmu_event_read(struct perf_event *event) +{ + event_update(event); +} + +static ssize_t +get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL); + +static struct attribute *pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group pmu_attr_group = { + .attrs = pmu_attrs, +}; + +/* + * Currently it only supports to report the power of each + * processor/package. + */ +EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01"); + +EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts"); + +/* Convert the count from micro-Watts to milli-Watts. */ +EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3"); + +static struct attribute *events_attr[] = { + EVENT_PTR(power_pkg), + EVENT_PTR(power_pkg_unit), + EVENT_PTR(power_pkg_scale), + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = formats_attr, +}; + +static const struct attribute_group *attr_groups[] = { + &pmu_attr_group, + &pmu_format_group, + &pmu_events_group, + NULL, +}; + +static struct pmu pmu_class = { + .attr_groups = attr_groups, + /* system-wide only */ + .task_ctx_nr = perf_invalid_context, + .event_init = pmu_event_init, + .add = pmu_event_add, + .del = pmu_event_del, + .start = pmu_event_start, + .stop = pmu_event_stop, + .read = pmu_event_read, +}; + +static void power_cpu_exit(int cpu) +{ + int target; + + if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask)) + return; + + /* + * Find a new CPU on the same compute unit, if was set in cpumask + * and still some CPUs on compute unit. Then migrate event and + * context to new CPU. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target < nr_cpumask_bits) { + cpumask_set_cpu(target, &cpu_mask); + perf_pmu_migrate_context(&pmu_class, cpu, target); + } +} + +static void power_cpu_init(int cpu) +{ + int target; + + /* + * 1) If any CPU is set at cpu_mask in the same compute unit, do + * nothing. + * 2) If no CPU is set at cpu_mask in the same compute unit, + * set current STARTING CPU. + * + * Note: if there is a CPU aside of the new one already in the + * sibling mask, then it is also in cpu_mask. 
+ */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target >= nr_cpumask_bits) + cpumask_set_cpu(cpu, &cpu_mask); +} + +static int +power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + power_cpu_init(cpu); + break; + case CPU_DOWN_PREPARE: + power_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block power_cpu_notifier_nb = { + .notifier_call = power_cpu_notifier, + .priority = CPU_PRI_PERF, +}; + +static const struct x86_cpu_id cpu_match[] = { + { .vendor = X86_VENDOR_AMD, .family = 0x15 }, + {}, +}; + +static int __init amd_power_pmu_init(void) +{ + int cpu, target, ret; + + if (!x86_match_cpu(cpu_match)) + return 0; + + if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) + return -ENODEV; + + cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); + + if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) { + pr_err("Failed to read max compute unit power accumulator MSR\n"); + return -ENODEV; + } + + cpu_notifier_register_begin(); + + /* Choose one online core of each compute unit. */ + for_each_online_cpu(cpu) { + target = cpumask_first(topology_sibling_cpumask(cpu)); + if (!cpumask_test_cpu(target, &cpu_mask)) + cpumask_set_cpu(target, &cpu_mask); + } + + ret = perf_pmu_register(&pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_warn("AMD Power PMU registration failed\n"); + goto out; + } + + __register_cpu_notifier(&power_cpu_notifier_nb); + + pr_info("AMD Power PMU detected\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +module_init(amd_power_pmu_init); + +static void __exit amd_power_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&power_cpu_notifier_nb); + cpu_notifier_register_done(); + + perf_pmu_unregister(&pmu_class); +} +module_exit(amd_power_pmu_exit); + +MODULE_AUTHOR("Huang Rui "); +MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5e830d0c95c9..002b2eadd600 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1602,8 +1602,7 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) return new; } -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page) +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ container_of(attr, struct perf_pmu_events_attr, attr); @@ -1615,6 +1614,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, return x86_pmu.events_sysfs_show(page, config); } +EXPORT_SYMBOL_GPL(events_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); -- cgit From cb2252522aaff572f28dc6613307e1e0e62496cd Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 20 Mar 2016 11:58:21 -0700 Subject: perf/x86/intel/uncore: Remove ev_sel_ext bit support for PCU The ev_sel_ext in PCU_MSR_PMON_CTL is locked on some CPU models, so despite it being documented in the SDM, if we write 1 to that bit then we can get a #GP fault. Which #GP the perf fuzzer happily triggered in Peter Zijlstra's testing. Also, there are no public events which use that bit, so remove ev_sel_ext bit support for PCU. 
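For context, a minimal sketch of why dropping the bit from the raw event mask is enough to close the hole: bits of attr->config that fall outside a PMU's raw event mask never reach the control MSR. The helper name and mask parameter below are illustrative only, not the uncore driver's literal code path.

static u64 pcu_filter_config(u64 config, u64 pcu_raw_event_mask)
{
	/*
	 * Anything outside the mask is stripped before the MSR write, so
	 * once SNBEP_PMON_CTL_EV_SEL_EXT is removed from the PCU mask a
	 * config with the locked bit set can no longer trigger the #GP.
	 */
	return config & pcu_raw_event_mask;
}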
Reported-by: Peter Zijlstra Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458500301-3594-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 93f6bd9bf761..ab2bcaaebe38 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -46,7 +46,6 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -148,7 +147,6 @@ /* IVBEP PCU */ #define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ @@ -258,7 +256,6 @@ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ SNBEP_CBO_PMON_CTL_TID_EN | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -472,7 +469,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -1313,7 +1310,7 @@ static struct attribute *ivbep_uncore_cbox_formats_attr[] = { }; static struct attribute *ivbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_thresh5.attr, -- cgit From 7b0fd56930399d87241ad2f49d48c315307471ee Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 20 Mar 2016 16:52:18 -0700 Subject: perf/x86/intel/rapl: Add missing Broadwell models Added Broadwell-H and Broadwell-Server. 
Signed-off-by: Srinivas Pandruvada Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1458517938-25308-1-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index b834a3f55a01..70c93f9b03ac 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -711,6 +711,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ + case 79: /* Broadwell-Server */ apply_quirk = true; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; @@ -718,6 +719,7 @@ static int __init rapl_pmu_init(void) case 60: /* Haswell */ case 69: /* Haswell-Celeron */ case 61: /* Broadwell */ + case 71: /* Broadwell-H */ rapl_cntr_mask = RAPL_IDX_HSW; rapl_pmu_events_group.attrs = rapl_events_hsw_attr; break; -- cgit From 2849eb4f99d54925c543db12917127f88b3c38ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Mar 2016 16:53:29 +0100 Subject: KVM: VMX: avoid guest hang on invalid invept instruction A guest executing an invalid invept instruction would hang because the instruction pointer was not updated. Cc: stable@vger.kernel.org Fixes: bfd0a56b90005f8c8a004baf407ad90045c2b11e Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75173efccac5..01f515873637 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7399,6 +7399,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } -- cgit From f6870ee9e53430f2a318ccf0dd5e66bb46194e43 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Mar 2016 16:53:42 +0100 Subject: KVM: VMX: avoid guest hang on invalid invvpid instruction A guest executing an invalid invvpid instruction would hang because the instruction pointer was not updated. Reported-by: jmontleo@redhat.com Tested-by: jmontleo@redhat.com Cc: stable@vger.kernel.org Fixes: 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 01f515873637..0ce009cc23f1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7458,6 +7458,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } -- cgit From ef697a712a6165aea7779c295604b099e8bfae2e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Mar 2016 16:58:38 +0100 Subject: KVM: VMX: fix nested vpid for old KVM guests Old KVM guests invoke single-context invvpid without actually checking whether it is supported. This was fixed by commit 518c8ae ("KVM: VMX: Make sure single type invvpid is supported before issuing invvpid instruction", 2010-08-01) and the patch after, but pre-2.6.36 kernels lack it including RHEL 6. 
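A minimal sketch of the guest-side check that commit 518c8ae introduced and that these old guests are missing; the helper name is illustrative, and vpid_caps stands for the guest's cached IA32_VMX_EPT_VPID_CAP value:

static bool single_context_invvpid_supported(u64 vpid_caps)
{
	/*
	 * Without this check the guest issues the single-context flush
	 * unconditionally, which is why the host-side patch below has to
	 * accept it and simply treat it as a global-context flush.
	 */
	return vpid_caps & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}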
Reported-by: jmontleo@redhat.com Tested-by: jmontleo@redhat.com Cc: stable@vger.kernel.org Fixes: 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ce009cc23f1..161230016dfa 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2712,8 +2712,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -7475,12 +7482,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + /* + * Old versions of KVM use the single-context version so we + * have to support it; just treat it the same as all-context. + */ case VMX_VPID_EXTENT_ALL_CONTEXT: __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: - /* Trap single context invalidation invvpid calls */ + /* Trap individual address invalidation invvpid calls */ BUG_ON(1); break; } -- cgit From f13577e8aa863cf3b4f17541bc74f8e9b0b40c90 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Mar 2016 10:08:16 +0100 Subject: KVM: MMU: return page fault error code from permission_fault This will help in the implementation of PKRU, where the PK bit of the page fault error code cannot be computed in advance (unlike I/D, R/W and U/S). Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 15 ++++++++++----- arch/x86/kvm/paging_tmpl.h | 5 ++--- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 58fe98a0a526..8b2b3dfca1ae 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -141,11 +141,15 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) } /* - * Will a fault with a given page-fault error code (pfec) cause a permission - * fault with the given access (in ACC_* format)? + * Check if a given access (described through the I/D, W/R and U/S bits of a + * page fault error code pfec) causes a permission fault with the given PTE + * access rights (in ACC_* format). + * + * Return zero if the access does not fault; return the page fault error code + * if the access faults. 
*/ -static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) +static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -169,7 +173,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, WARN_ON(pfec & PFERR_RSVD_MASK); - return (mmu->permissions[index] >> pte_access) & 1; + pfec |= PFERR_PRESENT_MASK; + return -((mmu->permissions[index] >> pte_access) & 1) & pfec; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e159a8185ad9..45ed3c38d20e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -359,10 +359,9 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { - errcode |= PFERR_PRESENT_MASK; + errcode = permission_fault(vcpu, mmu, pte_access, access); + if (unlikely(errcode)) goto error; - } gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; -- cgit From e0b18ef7189075676ac432954d7920eaa30d8e3e Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:14 +0800 Subject: KVM: x86: remove magic number with enum cpuid_leafs This patch removes magic number with enum cpuid_leafs. Signed-off-by: Huaitong Han Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0029644bf09c..00a69ecdd401 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -305,7 +305,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = + const u32 kvm_cpuid_1_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | @@ -315,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = + const u32 kvm_cpuid_8000_0001_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | @@ -325,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = + const u32 kvm_cpuid_1_ecx_x86_features = /* NOTE: MONITOR (and MWAIT) are emulated as NOP, * but *not* advertised to guests via CPUID ! 
*/ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | @@ -337,27 +337,27 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = + const u32 kvm_cpuid_8000_0001_ecx_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = + const u32 kvm_cpuid_C000_0001_edx_x86_features = F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = + const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ - const u32 kvm_supported_word10_x86_features = + const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* all calls to cpuid_count() should be made on the same cpu */ @@ -376,10 +376,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, (u32)0xd); break; case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); + entry->edx &= kvm_cpuid_1_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_1_EDX); + entry->ecx &= kvm_cpuid_1_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -433,8 +433,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); } else @@ -514,7 +514,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_1_ent(&entry[i], function, idx); if (idx == 1) { - entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; entry[i].ebx = 0; if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) entry[i].ebx = @@ -564,10 +564,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); + entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); + entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -600,8 +600,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 
0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); + entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ -- cgit From ddba262891494c1337fcbec8910605285012c121 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:15 +0800 Subject: KVM, pkeys: disable pkeys for guests in non-paging mode Pkeys is disabled if CPU is in non-paging mode in hardware. However KVM always uses paging mode to emulate guest non-paging, mode with TDP. To emulate this behavior, pkeys needs to be manually disabled when guest switches to non-paging mode. Signed-off-by: Huaitong Han Reviewed-by: Xiao Guangrong Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 161230016dfa..1b8a41f64d35 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3893,13 +3893,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!enable_unrestricted_guest && !is_paging(vcpu)) /* - * SMEP/SMAP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode without - * unrestricted guest. - * To emulate this behavior, SMEP/SMAP needs to be manually - * disabled when guest switches to non-paging mode. + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); -- cgit From 17a511f878505d31f298de443269b3070c134163 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:16 +0800 Subject: KVM, pkeys: add pkeys support for xsave state This patch adds pkeys support for xsave state. Signed-off-by: Huaitong Han Reviewed-by: Xiao Guangrong Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 007940faa5c6..7ce3634ab5fe 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ - | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); -- cgit From 9e90199c25aec31b4509213881511948f6c763c8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 22 Mar 2016 16:51:17 +0800 Subject: x86: pkey: introduce write_pkru() for KVM KVM will use it to switch pkru between guest and host. 
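A rough sketch of the intended call pattern (the actual hook points land in the VMX code in a follow-up patch); the guest_pkru/host_pkru fields are illustrative here and are not added by this patch:

static void pkru_switch_to_guest(struct vcpu_vmx *vmx)
{
	vmx->host_pkru = read_pkru();	/* snapshot the host's protection keys */
	write_pkru(vmx->guest_pkru);	/* no-op unless X86_FEATURE_OSPKE is set */
}

static void pkru_switch_to_host(struct vcpu_vmx *vmx)
{
	vmx->guest_pkru = read_pkru();	/* preserve whatever the guest set */
	write_pkru(vmx->host_pkru);	/* restore before KVM touches user memory */
}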
CC: Ingo Molnar CC: Dave Hansen Signed-off-by: Xiao Guangrong Signed-off-by: Huaitong Han Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/include/asm/special_insns.h | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1ff49ec29ece..97f3242e133c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -107,6 +107,12 @@ static inline u32 read_pkru(void) return 0; } +static inline void write_pkru(u32 pkru) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + __write_pkru(pkru); +} + static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aee6e76e561e..d96d04377765 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -113,11 +113,27 @@ static inline u32 __read_pkru(void) : "c" (ecx)); return pkru; } + +static inline void __write_pkru(u32 pkru) +{ + u32 ecx = 0, edx = 0; + + /* + * "wrpkru" instruction. Loads contents in EAX to PKRU, + * requires that ecx = edx = 0. + */ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(ecx), "d"(edx)); +} #else static inline u32 __read_pkru(void) { return 0; } + +static inline void __write_pkru(u32 pkru) +{ +} #endif static inline void native_wbinvd(void) -- cgit From 1be0e61c1f255faaeab04a390e00c8b9b9042870 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 22 Mar 2016 16:51:18 +0800 Subject: KVM, pkeys: save/restore PKRU when guest/host switches Currently XSAVE state of host is not restored after VM-exit and PKRU is managed by XSAVE so the PKRU from guest is still controlling the memory access even if the CPU is running the code of host. This is not safe as KVM needs to access the memory of userspace (e,g QEMU) to do some emulation. So we save/restore PKRU when guest/host switches. Signed-off-by: Huaitong Han Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1b8a41f64d35..87664775b51e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -598,6 +598,10 @@ struct vcpu_vmx { struct page *pml_pg; u64 current_tsc_ratio; + + bool guest_pkru_valid; + u32 guest_pkru; + u32 host_pkru; }; enum segment_cache_field { @@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); } + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -8639,6 +8645,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + if (vmx->guest_pkru_valid) + __write_pkru(vmx->guest_pkru); + atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -8778,6 +8787,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + /* + * eager fpu is enabled if PKEY is supported and CR4 is switched + * back on host, so it is safe to read guest PKRU from current + * XSAVE. 
+ */ + if (boot_cpu_has(X86_FEATURE_OSPKE)) { + vmx->guest_pkru = __read_pkru(); + if (vmx->guest_pkru != vmx->host_pkru) { + vmx->guest_pkru_valid = true; + __write_pkru(vmx->host_pkru); + } else + vmx->guest_pkru_valid = false; + } + /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of -- cgit From 2d344105f57ca77fc9c5d4377f65d1082f71ac4b Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:19 +0800 Subject: KVM, pkeys: introduce pkru_mask to cache conditions PKEYS defines a new status bit in the PFEC. PFEC.PK (bit 5), if some conditions is true, the fault is considered as a PKU violation. pkru_mask indicates if we need to check PKRU.ADi and PKRU.WDi, and does cache some conditions for permission_fault. [ Huaitong: Xiao helps to modify many sections. ] Signed-off-by: Huaitong Han Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +++++ arch/x86/kvm/mmu.c | 80 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 01c8b501cb6d..8968165963e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -335,6 +335,14 @@ struct kvm_mmu { */ u8 permissions[16]; + /* + * The pkru_mask indicates if protection key checks are needed. It + * consists of 16 domains indexed by page fault error code bits [4:1], + * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. + * Each domain has 2 bits which are ANDed with AD and WD from PKRU. + */ + u32 pkru_mask; + u64 *pae_root; u64 *lm_root; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c512f095cdac..e46a81f4856e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3923,6 +3923,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } +/* +* PKU is an additional mechanism by which the paging controls access to +* user-mode addresses based on the value in the PKRU register. Protection +* key violations are reported through a bit in the page fault error code. +* Unlike other bits of the error code, the PK bit is not known at the +* call site of e.g. gva_to_gpa; it must be computed directly in +* permission_fault based on two bits of PKRU, on some machine state (CR4, +* CR0, EFER, CPL), and on other bits of the error code and the page tables. +* +* In particular the following conditions come from the error code, the +* page tables and the machine state: +* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 +* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) +* - PK is always zero if U=0 in the page tables +* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. +* +* The PKRU bitmask caches the result of these four conditions. The error +* code (minus the P bit) and the page table's U bit form an index into the +* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed +* with the two bits of the PKRU register corresponding to the protection key. +* For the first three conditions above the bits will be 00, thus masking +* away both AD and WD. For all reads or if the last condition holds, WD +* only will be masked away. +*/ +static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept) +{ + unsigned bit; + bool wp; + + if (ept) { + mmu->pkru_mask = 0; + return; + } + + /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. 
*/ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + mmu->pkru_mask = 0; + return; + } + + wp = is_write_protection(vcpu); + + for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { + unsigned pfec, pkey_bits; + bool check_pkey, check_write, ff, uf, wf, pte_user; + + pfec = bit << 1; + ff = pfec & PFERR_FETCH_MASK; + uf = pfec & PFERR_USER_MASK; + wf = pfec & PFERR_WRITE_MASK; + + /* PFEC.RSVD is replaced by ACC_USER_MASK. */ + pte_user = pfec & PFERR_RSVD_MASK; + + /* + * Only need to check the access which is not an + * instruction fetch and is to a user page. + */ + check_pkey = (!ff && pte_user); + /* + * write access is controlled by PKRU if it is a + * user access or CR0.WP = 1. + */ + check_write = check_pkey && wf && (uf || wp); + + /* PKRU.AD stops both read and write access. */ + pkey_bits = !!check_pkey; + /* PKRU.WD stops write access. */ + pkey_bits |= (!!check_write) << 1; + + mmu->pkru_mask |= (pkey_bits & 3) << pfec; + } +} + static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; @@ -3941,6 +4016,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); @@ -3968,6 +4044,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; @@ -4026,6 +4103,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4078,6 +4156,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); + update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4132,6 +4211,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); + update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } -- cgit From be94f6b71067df47d623fc6c6983a8dee504fb4d Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:20 +0800 Subject: KVM, pkeys: add pkeys support for permission_fault Protection keys define a new 4-bit protection key field (PKEY) in bits 62:59 of leaf entries of the page tables, the PKEY is an index to PKRU register(16 domains), every domain has 2 bits(write disable bit, access disable bit). Static logic has been produced in update_pkru_bitmask, dynamic logic need read pkey from page table entries, get pkru value, and deduce the correct result. [ Huaitong: Xiao helps to modify many sections. 
] Signed-off-by: Huaitong Han Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/kvm_cache_regs.h | 5 +++++ arch/x86/kvm/mmu.h | 33 ++++++++++++++++++++++++++++----- arch/x86/kvm/paging_tmpl.h | 16 ++++++++++++++-- arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 8 ++++++++ arch/x86/kvm/x86.c | 7 ++++++- 7 files changed, 72 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8968165963e4..0d75ecdfa077 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -187,12 +187,14 @@ enum { #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) #define PFERR_USER_MASK (1U << PFERR_USER_BIT) #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) +#define PFERR_PK_MASK (1U << PFERR_PK_BIT) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -882,6 +884,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index e1e89ee4af75..762cdf2595f9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } +static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_pkru(vcpu); +} + static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 8b2b3dfca1ae..b70df72e2b33 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -10,10 +10,11 @@ #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 +#define PT_USER_SHIFT 2 #define PT_PRESENT_MASK (1ULL << 0) #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) +#define PT_USER_MASK (1ULL << PT_USER_SHIFT) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 @@ -149,7 +150,8 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * if the access faults. */ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) + unsigned pte_access, unsigned pte_pkey, + unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -170,11 +172,32 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + bool fault = (mmu->permissions[index] >> pte_access) & 1; - WARN_ON(pfec & PFERR_RSVD_MASK); - + WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); pfec |= PFERR_PRESENT_MASK; - return -((mmu->permissions[index] >> pte_access) & 1) & pfec; + + if (unlikely(mmu->pkru_mask)) { + u32 pkru_bits, offset; + + /* + * PKRU defines 32 bits, there are 16 domains and 2 + * attribute bits per domain in pkru. 
pte_pkey is the + * index of the protection domain, so pte_pkey * 2 is + * is the index of the first bit for the domain. + */ + pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + + /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ + offset = pfec - 1 + + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + + pkru_bits &= mmu->pkru_mask >> offset; + pfec |= -pkru_bits & PFERR_PK_MASK; + fault |= (pkru_bits != 0); + } + + return -(uint32_t)fault & pfec; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 45ed3c38d20e..e0c225421157 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return 0; } +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned pkeys = 0; +#if PTTYPE == 64 + pte_t pte = {.pte = gpte}; + + pkeys = pte_flags_pkey(pte_flags(pte)); +#endif + return pkeys; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty; + unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -359,7 +370,8 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - errcode = permission_fault(vcpu, mmu, pte_access, access); + pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); + errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95070386d599..31346a3f20a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static u32 svm_get_pkru(struct kvm_vcpu *vcpu) +{ + return 0; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + + .get_pkru = svm_get_pkru, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 87664775b51e..efc243e4dabf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2292,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->guest_pkru; +} + static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -10925,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + + .get_pkru = vmx_get_pkru, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7236bd3a4c3d..d0a5918da654 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4326,9 +4326,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); + /* + * currently PKRU is only applied to ept enabled guest so + * there is no pkey in EPT page table for L1 guest or EPT + * shadow page table for L2 guest. + */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, access)) { + vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); -- cgit From b9baba861489041b37b54fc7ee0b0006b5327151 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:21 +0800 Subject: KVM, pkeys: expose CPUID/CR4 to guest X86_FEATURE_PKU is referred to as "PKU" in the hardware documentation: CPUID.7.0.ECX[3]:PKU. X86_FEATURE_OSPKE is software support for pkeys, enumerated with CPUID.7.0.ECX[4]:OSPKE, and it reflects the setting of CR4.PKE(bit 22). This patch disables CPUID:PKU without ept, because pkeys is not yet implemented for shadow paging. Signed-off-by: Huaitong Han Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/cpuid.c | 23 +++++++++++++++++++++-- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/x86.c | 9 ++++++--- 4 files changed, 37 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0d75ecdfa077..f62a9f37f79f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -84,7 +84,8 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ + | X86_CR4_PKE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 00a69ecdd401..8efb839948e5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 7, 0); + if (best) { + /* Update OSPKE bit */ + if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { + best->ecx &= ~F(OSPKE); + if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) + best->ecx |= F(OSPKE); + } + } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; @@ -360,6 +370,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -437,10 +450,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); - } else + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* PKU is not yet implemented for shadow paging. 
*/ + if (!tdp_enabled) + entry->ecx &= ~F(PKU); + } else { entry->ebx = 0; + entry->ecx = 0; + } entry->eax = 0; - entry->ecx = 0; entry->edx = 0; break; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 66a6581724ad..e17a74b1d852 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ecx & bit(X86_FEATURE_PKU)); +} + static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0a5918da654..98ae1fb9fd66 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) return 1; @@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; + if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); return 0; @@ -7128,7 +7131,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) + if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); -- cgit From 9753f529156d03112d1ff6f3758ccd6fa6faf189 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:24 +0800 Subject: KVM: Remove redundant smp_mb() in the kvm_mmu_commit_zap_page() There is already a barrier inside of kvm_flush_remote_tlbs() which can help to make sure everyone sees our modifications to the page tables and see changes to vcpu->mode here. So remove the smp_mb in the kvm_mmu_commit_zap_page() and update the comment. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e46a81f4856e..fc8ab1f67d00 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2390,14 +2390,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, return; /* - * wmb: make sure everyone sees our modifications to the page tables - * rmb: make sure we see changes to vcpu->mode - */ - smp_mb(); - - /* - * Wait for all vcpus to exit guest mode and/or lockless shadow - * page table walks. + * We need to make sure everyone sees our modifications to + * the page tables and see changes to vcpu->mode here. The barrier + * in the kvm_flush_remote_tlbs() achieves this. This pairs + * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. 
+ * + * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit + * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); -- cgit From 36ca7e0a577aa61f648953886958e50398732d63 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:25 +0800 Subject: KVM/x86: Replace smp_mb() with smp_store_mb/release() in the walk_shadow_page_lockless_begin/end() Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fc8ab1f67d00..6bdfbc23ecaa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); - vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ - smp_mb(); + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) @@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) * reads to sptes. If it does, kvm_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ - smp_mb(); - vcpu->mode = OUTSIDE_GUEST_MODE; + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } -- cgit From 7bfdf2177812c30928bea3fc8bc86b9dea236f65 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:27 +0800 Subject: KVM/x86: Call smp_wmb() before increasing tlbs_dirty Update spte before increasing tlbs_dirty to make sure no tlb flush in lost after spte is zapped. This pairs with the barrier in the kvm_flush_remote_tlbs(). Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e0c225421157..1d971c7553c3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -960,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + /* + * Update spte before increasing tlbs_dirty to make + * sure no tlb flush is lost after spte is zapped; see + * the comments in kvm_flush_remote_tlbs(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } @@ -975,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); + /* + * The same as above where we are doing + * prefetch_invalid_gpte(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } -- cgit From 0f127d12e44c13d2f90e5c613490e70ddb1a5e08 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:29 +0800 Subject: KVM/x86: update the comment of memory barrier in the vcpu_enter_guest() The barrier also orders the write to mode from any reads to the page tables done and so update the comment. 
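For illustration, a rough two-CPU skeleton (not taken from any of these patches) of the ordering the updated comment describes; vcpu_mode and page_tables are stand-ins for vcpu->mode and the shadow page tables, and both functions are hypothetical names.

/* Illustrative skeleton only; WRITE_ONCE/READ_ONCE/smp_mb() are the stock kernel primitives. */
static int vcpu_mode;	/* stand-in for vcpu->mode */
static int page_tables;	/* stand-in for the shadow page tables */

/* vCPU side, as in vcpu_enter_guest(): publish the mode, then read page tables. */
static void vcpu_side(void)
{
	WRITE_ONCE(vcpu_mode, 1);	/* IN_GUEST_MODE */
	smp_mb();			/* orders the mode write before the page-table reads */
	(void)READ_ONCE(page_tables);	/* page-table reads done while running the guest */
}

/* Zapper side, as in kvm_mmu_commit_zap_page(): modify tables, then check the mode. */
static void zapper_side(void)
{
	WRITE_ONCE(page_tables, 2);	/* zap sptes */
	smp_mb();			/* supplied by the barrier inside kvm_flush_remote_tlbs() */
	if (READ_ONCE(vcpu_mode) == 1) {
		/* the vCPU may already be in guest mode: kick it / send the flush IPI */
	}
}

With full barriers on both sides, at least one CPU observes the other's write: either the vCPU sees the zapped page tables, or the zapper sees IN_GUEST_MODE and kicks it.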
Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98ae1fb9fd66..e260ccbc8f55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6596,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. + /* + * We should set ->mode before check ->requests, + * Please see the comment in kvm_make_all_cpus_request. + * This also orders the write to mode from any reads + * to the page tables done while the VCPU is running. + * Please see the comment in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); -- cgit From 9db284f30311b741a5e74eb8e5800ff7b92c4be3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 21 Mar 2016 15:13:27 +0100 Subject: kvm, rt: change async pagefault code locking for PREEMPT_RT The async pagefault wake code can run from the idle task in exception context, so everything here needs to be made non-preemptible. Conversion to a simple wait queue and raw spinlock does the trick. Signed-off-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 47190bd399e7..807950860fb7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -91,14 +92,14 @@ static void kvm_io_delay(void) struct kvm_task_sleep_node { struct hlist_node link; - wait_queue_head_t wq; + struct swait_queue_head wq; u32 token; int cpu; bool halted; }; static struct kvm_task_sleep_head { - spinlock_t lock; + raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token) u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); rcu_irq_enter(); - spin_lock(&b->lock); + raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); kfree(e); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); rcu_irq_exit(); return; @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); n.halted = is_idle_task(current) || preempt_count() > 1; - init_waitqueue_head(&n.wq); + init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); for (;;) { if (!n.halted) - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token) } } if (!n.halted) - finish_wait(&n.wq, &wait); + finish_swait(&n.wq, &wait); rcu_irq_exit(); return; @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (waitqueue_active(&n->wq)) - wake_up(&n->wq); + else if (swait_active(&n->wq)) + swake_up(&n->wq); } static void apf_task_wake_all(void) @@ -189,14 +190,14 @@ static void apf_task_wake_all(void) 
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - spin_lock(&b->lock); + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); } } @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token) } again: - spin_lock(&b->lock); + raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* @@ -225,17 +226,17 @@ again: * Allocation failed! Busy wait while other cpu * handles async PF. */ - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); cpu_relax(); goto again; } n->token = token; n->cpu = smp_processor_id(); - init_waitqueue_head(&n->wq); + init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else apf_task_wake_one(n); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -486,7 +487,7 @@ void __init kvm_guest_init(void) paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - spin_lock_init(&async_pf_sleepers[i].lock); + raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; -- cgit From a6adb106225f9e2f177d3d883596e011df321965 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Mar 2016 17:25:42 +0100 Subject: KVM: page_track: fix access to NULL slot This happens when doing the reboot test from virt-tests: [ 131.833653] BUG: unable to handle kernel NULL pointer dereference at (null) [ 131.842461] IP: [] kvm_page_track_is_active+0x17/0x60 [kvm] [ 131.850500] PGD 0 [ 131.852763] Oops: 0000 [#1] SMP [ 132.007188] task: ffff880075fbc500 ti: ffff880850a3c000 task.ti: ffff880850a3c000 [ 132.138891] Call Trace: [ 132.141639] [] page_fault_handle_page_track+0x31/0x40 [kvm] [ 132.149732] [] paging64_page_fault+0xff/0x910 [kvm] [ 132.172159] [] kvm_mmu_page_fault+0x64/0x110 [kvm] [ 132.179372] [] handle_exception+0x1b2/0x430 [kvm_intel] [ 132.187072] [] vmx_handle_exit+0x1e1/0xc50 [kvm_intel] ... Cc: Xiao Guangrong Fixes: 3d0c27ad6ee465f174b09ee99fcaf189c57d567a Signed-off-by: Paolo Bonzini --- arch/x86/kvm/page_track.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 11f76436f74f..b431539c3714 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + struct kvm_memory_slot *slot; + int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot) + return false; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } -- cgit From f970165beeaa4803438e435d68022b3680a1a0b0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:27 -0700 Subject: x86/compat: remove is_compat_task() x86's is_compat_task always checked the current syscall type, not the task type. 
It has no non-arch users any more, so just remove it to avoid confusion. On x86, nothing should really be checking the task ABI. There are legitimate users for the syscall ABI and for the mm ABI. Signed-off-by: Andy Lutomirski Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/compat.h | 3 ++- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index acdee09228b3..ebb102e1bbc7 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -316,9 +316,10 @@ static inline bool is_x32_task(void) return false; } -static inline bool is_compat_task(void) +static inline bool in_compat_syscall(void) { return is_ia32_task() || is_x32_task(); } +#define in_compat_syscall in_compat_syscall /* override the generic impl */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 24938852db30..21b66dbf3601 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -58,7 +58,7 @@ int ftrace_int3_handler(struct pt_regs *regs); #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) { - if (is_compat_task()) + if (in_compat_syscall()) return true; return false; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 776229e98202..dfa2781610e8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -478,7 +478,7 @@ void set_personality_ia32(bool x32) if (current->mm) current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; - /* is_compat_task() uses the presence of the x32 + /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ current_thread_info()->status &= ~TS_COMPAT; } else { -- cgit From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 22 Mar 2016 14:27:30 -0700 Subject: kernel: add kcov code coverage kcov provides code coverage collection for coverage-guided fuzzing (randomized testing). Coverage-guided fuzzing is a testing technique that uses coverage feedback to determine new interesting inputs to a system. A notable user-space example is AFL (http://lcamtuf.coredump.cx/afl/). However, this technique is not widely used for kernel testing due to missing compiler and kernel support. kcov does not aim to collect as much coverage as possible. It aims to collect more or less stable coverage that is function of syscall inputs. To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic or non-interesting parts of kernel is disbled (e.g. scheduler, locking). Currently there is a single coverage collection mode (tracing), but the API anticipates additional collection modes. Initially I also implemented a second mode which exposes coverage in a fixed-size hash table of counters (what Quentin used in his original patch). I've dropped the second mode for simplicity. This patch adds the necessary support on kernel side. The complimentary compiler support was added in gcc revision 231296. 
We've used this support to build syzkaller system call fuzzer, which has found 90 kernel bugs in just 2 months: https://github.com/google/syzkaller/wiki/Found-Bugs We've also found 30+ bugs in our internal systems with syzkaller. Another (yet unexplored) direction where kcov coverage would greatly help is more traditional "blob mutation". For example, mounting a random blob as a filesystem, or receiving a random blob over wire. Why not gcov. Typical fuzzing loop looks as follows: (1) reset coverage, (2) execute a bit of code, (3) collect coverage, repeat. A typical coverage can be just a dozen of basic blocks (e.g. an invalid input). In such context gcov becomes prohibitively expensive as reset/collect coverage steps depend on total number of basic blocks/edges in program (in case of kernel it is about 2M). Cost of kcov depends only on number of executed basic blocks/edges. On top of that, kernel requires per-thread coverage because there are always background threads and unrelated processes that also produce coverage. With inlined gcov instrumentation per-thread coverage is not possible. kcov exposes kernel PCs and control flow to user-space which is insecure. But debugfs should not be mapped as user accessible. Based on a patch by Quentin Casasnovas. [akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode'] [akpm@linux-foundation.org: unbreak allmodconfig] [akpm@linux-foundation.org: follow x86 Makefile layout standards] Signed-off-by: Dmitry Vyukov Reviewed-by: Kees Cook Cc: syzkaller Cc: Vegard Nossum Cc: Catalin Marinas Cc: Tavis Ormandy Cc: Will Deacon Cc: Quentin Casasnovas Cc: Kostya Serebryany Cc: Eric Dumazet Cc: Alexander Potapenko Cc: Kees Cook Cc: Bjorn Helgaas Cc: Sasha Levin Cc: David Drysdale Cc: Ard Biesheuvel Cc: Andrey Ryabinin Cc: Kirill A. Shutemov Cc: Jiri Slaby Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/boot/Makefile | 7 +++++++ arch/x86/boot/compressed/Makefile | 3 +++ arch/x86/entry/vdso/Makefile | 3 +++ arch/x86/kernel/Makefile | 6 ++++++ arch/x86/kernel/apic/Makefile | 4 ++++ arch/x86/kernel/cpu/Makefile | 4 ++++ arch/x86/lib/Makefile | 3 +++ arch/x86/mm/Makefile | 3 +++ arch/x86/realmode/rm/Makefile | 3 +++ 10 files changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8b680a5cb25b..54478b7635de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0bf6749522d9..b1ef9e489084 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,13 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Kernel does not boot with kcov instrumentation here. +# One of the problems observed was insertion of __sanitizer_cov_trace_pc() +# callback into middle of per-cpu data enabling code. Thus the callback observed +# inconsistent state and crashed. We are interested mostly in syscall coverage, +# so boot code is not interesting anyway. +KCOV_INSTRUMENT := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 
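For a sense of how the tracing mode above is consumed, here is a hedged user-space sketch based on the interface this series documents elsewhere; the ioctl names come from the uapi header added by the full patch, which is outside the arch/x86 excerpt shown here, so treat the exact constants and signatures as assumptions. Error handling is trimmed for brevity.

/* Minimal kcov consumer sketch (assumed interface, not shown in this excerpt). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kcov.h>		/* KCOV_INIT_TRACE, KCOV_ENABLE, KCOV_DISABLE */

#define COVER_SIZE (64 << 10)	/* capacity of the trace buffer, in recorded PCs */

int main(void)
{
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
	unsigned long *cover;

	/* Size the per-task trace buffer (in unsigned longs), then map it. */
	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	ioctl(fd, KCOV_ENABLE, 0);	/* start collecting for this task */
	cover[0] = 0;			/* word 0 holds the number of PCs recorded */

	read(-1, NULL, 0);		/* the syscall whose coverage we want */

	for (unsigned long i = 0; i < cover[0]; i++)
		printf("0x%lx\n", cover[i + 1]);

	ioctl(fd, KCOV_DISABLE, 0);
	munmap(cover, COVER_SIZE * sizeof(unsigned long));
	close(fd);
	return 0;
}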
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5e1d26e09407..6915ff2bd996 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,6 +19,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index f9fb859c98b9..6874da5f67fc 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -7,6 +7,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d5fb0871aba3..adaae2c781c1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. +KCOV_INSTRUMENT := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8bb12ddc5db8..8e63ebdcbd0b 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,6 +2,10 @@ # Makefile for local APIC drivers and for the IO-APIC code # +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 0d373d7affc8..4a8697f7d4ef 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a501fa25da41..72a576752a7e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,6 +2,9 @@ # Makefile for x86 specific library files. # +# Produces uninteresting flaky coverage. +KCOV_INSTRUMENT_delay.o := n + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 67cf2e1e557b..f98913258c63 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,3 +1,6 @@ +# Kernel does not boot with instrumentation of tlb.c. 
+KCOV_INSTRUMENT_tlb.o := n + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o gup.o setup_nx.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 053abe7b0ef7..b95964610ea7 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -9,6 +9,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o -- cgit From 29934b0fb8ff5afa61832ea56aec65befab30511 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Mar 2016 14:28:17 -0700 Subject: x86/extable: use generic search and sort routines Replace the arch specific versions of search_extable() and sort_extable() with calls to the generic ones, which now support relative exception tables as well. Signed-off-by: Ard Biesheuvel Acked-by: H. Peter Anvin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 5 +- arch/x86/mm/extable.c | 108 ----------------------------------------- 2 files changed, 2 insertions(+), 111 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 88bff6dd23ad..a969ae607be8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -105,9 +105,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un struct exception_table_entry { int insn, fixup, handler; }; -/* This is not the generic standard exception_table_entry format */ -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE + +#define ARCH_HAS_RELATIVE_EXTABLE extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 9dd7e4b7fcde..82447b3fba38 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,16 +1,9 @@ #include -#include -#include #include typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); -static inline unsigned long -ex_insn_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { @@ -110,104 +103,3 @@ int __init early_fixup_exception(unsigned long *ip) *ip = new_ip; return 1; } - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - while (first <= last) { - const struct exception_table_entry *mid; - unsigned long addr; - - mid = ((last - first) >> 1) + first; - addr = ex_insn_addr(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; -} - -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. 
- * - */ -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *x = a, *y = b; - - /* - * This value will always end up fittin in an int, because on - * both i386 and x86-64 the kernel symbol-reachable address - * space is < 2 GiB. - * - * This compare is only valid after normalization. - */ - return x->insn - y->insn; -} - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - struct exception_table_entry *p; - int i; - - /* Convert all entries to being relative to the start of the section */ - i = 0; - for (p = start; p < finish; p++) { - p->insn += i; - i += 4; - p->fixup += i; - i += 4; - p->handler += i; - i += 4; - } - - sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); - - /* Denormalize all entries */ - i = 0; - for (p = start; p < finish; p++) { - p->insn -= i; - i += 4; - p->fixup -= i; - i += 4; - p->handler -= i; - i += 4; - } -} - -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ -- cgit From 9da77666d6975219281fd400eb9608a047337414 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 22 Mar 2016 19:06:08 -0400 Subject: x86/msr: Remove unused native_read_tscp() After e76b027 ("x86,vdso: Use LSL unconditionally for vgetcpu") native_read_tscp() is unused in the kernel. The function can be removed like native_read_tsc() was. Signed-off-by: Prarit Bhargava Acked-by: Andy Lutomirski Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1458687968-9106-1-git-send-email-prarit@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/msr.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 93fb7c1cffda..7a79ee2778b3 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -42,14 +42,6 @@ struct saved_msrs { struct saved_msr *array; }; -static inline unsigned long long native_read_tscp(unsigned int *aux) -{ - unsigned long low, high; - asm volatile(".byte 0x0f,0x01,0xf9" - : "=a" (low), "=d" (high), "=c" (*aux)); - return low | ((u64)high << 32); -} - /* * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" * constraint has different meanings. For i386, "A" means exactly -- cgit From be7635e7287e0e8013af3c89a6354a9e0182594c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:05 -0700 Subject: arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections KASAN needs to know whether the allocation happens in an IRQ handler. This lets us strip everything below the IRQ entry point to reduce the number of unique stack traces needed to be stored. Move the definition of __irq_entry to so that the users don't need to pull in . Also introduce the __softirq_entry macro which is similar to __irq_entry, but puts the corresponding functions to the .softirqentry.text section. 
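For illustration, a small sketch (not part of this patch) of how handlers end up in the dedicated sections once the annotations are available; both example functions are made-up names, and the sketch assumes __irq_entry and the new __softirq_entry come from <linux/interrupt.h> after this series.

/* Illustrative sketch only. */
#include <linux/interrupt.h>

/* Lands in .irqentry.text, so KASAN can cut stack traces below this frame. */
static void __irq_entry example_hardirq_entry(void)
{
	/* hard-interrupt entry work would go here */
}

/* Lands in .softirqentry.text, the section this patch adds to the linker script. */
static void __softirq_entry example_softirq_action(struct softirq_action *h)
{
	/* softirq handler body would go here */
}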
Signed-off-by: Alexander Potapenko Acked-by: Steven Rostedt Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d239639e0c1d..4c941f88d405 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -101,6 +101,7 @@ SECTIONS KPROBES_TEXT ENTRY_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) /* End of text section */ -- cgit From cd11016e5f5212c13c0cec7384a525edc93b4921 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:08 -0700 Subject: mm, kasan: stackdepot implementation. Enable stackdepot for SLAB Implement the stack depot and provide CONFIG_STACKDEPOT. Stack depot will allow KASAN store allocation/deallocation stack traces for memory chunks. The stack traces are stored in a hash table and referenced by handles which reside in the kasan_alloc_meta and kasan_free_meta structures in the allocated memory chunks. IRQ stack traces are cut below the IRQ entry point to avoid unnecessary duplication. Right now stackdepot support is only enabled in SLAB allocator. Once KASAN features in SLAB are on par with those in SLUB we can switch SLUB to stackdepot as well, thus removing the dependency on SLUB stack bookkeeping, which wastes a lot of memory. This patch is based on the "mm: kasan: stack depots" patch originally prepared by Dmitry Chernenkov. Joonsoo has said that he plans to reuse the stackdepot code for the mm/page_owner.c debugging facility. [akpm@linux-foundation.org: s/depot_stack_handle/depot_stack_handle_t] [aryabinin@virtuozzo.com: comment style fixes] Signed-off-by: Alexander Potapenko Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index adaae2c781c1..616ebd22ef9a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -19,6 +19,7 @@ endif KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_stacktrace.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -- cgit
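For illustration, a rough sketch of how a caller such as KASAN's SLAB hooks might save and later fetch a stack trace through the depot. The depot_save_stack()/depot_fetch_stack() names follow the API this series introduces in lib/stackdepot.c, which is outside the arch/x86 excerpt shown above, so treat the exact signatures as an assumption; both example functions are hypothetical.

/* Illustrative sketch only. */
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

/* Capture the current stack and store it in the depot, getting back a
 * small handle that can live in kasan_alloc_meta/kasan_free_meta. */
static depot_stack_handle_t example_save_stack(gfp_t flags)
{
	unsigned long entries[64];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 0,
	};

	save_stack_trace(&trace);
	return depot_save_stack(&trace, flags);
}

/* Resolve a handle back into a stack trace, e.g. when printing a KASAN report. */
static void example_print_stack(depot_stack_handle_t handle)
{
	struct stack_trace trace;

	depot_fetch_stack(handle, &trace);
	print_stack_trace(&trace, 0);
}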