From 2406e3b166eee42777a6b0b38f52f924454474d7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Sep 2017 21:36:56 +0200
Subject: perf/x86/intel, watchdog/core: Sanitize PMU HT bug workaround

The lockup_detector_suspend/resume() interface is broken in several ways
especially as it results in recursive locking of the CPU hotplug lock.

Use the new stop/restart interface in the perf NMI watchdog to temporarily
disable and reenable the already active watchdog events. That's enough to
handle it.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.247141871@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 829e89cfcee2..9fb9a1f1e47b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	if (lockup_detector_suspend() != 0) {
-		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-		return 0;
-	}
+	cpus_read_lock();
+
+	hardlockup_detector_perf_stop();
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	lockup_detector_resume();
-
-	cpus_read_lock();
+	hardlockup_detector_perf_restart();
 
 	for_each_online_cpu(c)
 		free_excl_cntrs(c);
-- 
cgit 


From 51a9a8284e43642fc3e85810fd54f4c245d23a14 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 18 Sep 2017 10:03:12 +0100
Subject: x86/xen: clean up clang build warning

In the case where sizeof(maddr) != sizeof(long) p is initialized and
never read and clang throws a warning on this.  Move declaration of
p to clean up the clang build warning:

warning: Value stored to 'p' during its initialization is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/include/asm/xen/hypercall.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9606688caa4b..e089c1675a7c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -552,13 +552,13 @@ static inline void
 MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 			struct desc_struct desc)
 {
-	u32 *p = (u32 *) &desc;
-
 	mcl->op = __HYPERVISOR_update_descriptor;
 	if (sizeof(maddr) == sizeof(long)) {
 		mcl->args[0] = maddr;
 		mcl->args[1] = *(unsigned long *)&desc;
 	} else {
+		u32 *p = (u32 *)&desc;
+
 		mcl->args[0] = maddr;
 		mcl->args[1] = maddr >> 32;
 		mcl->args[2] = *p++;
-- 
cgit 


From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001
From: Ladi Prosek <lprosek@redhat.com>
Date: Fri, 22 Sep 2017 07:53:15 +0200
Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache

For nested virt we maintain multiple VMCS that can run on a vCPU. So it is
incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching
the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in
vCPU-wide data structures.

Hyper-V nested on KVM runs into this consistently for me with PCID enabled.
CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3)
fires, and the currently loaded VMCS is updated. Then we switch from L2 to
L1 and the next exit reverts CR3 to its old value.

Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant")
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0726ca7a1b02..c83d28b0ab05 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -200,6 +200,8 @@ struct loaded_vmcs {
 	int cpu;
 	bool launched;
 	bool nmi_known_unmasked;
+	unsigned long vmcs_host_cr3;	/* May not match real cr3 */
+	unsigned long vmcs_host_cr4;	/* May not match real cr4 */
 	struct list_head loaded_vmcss_on_cpu_link;
 };
 
@@ -600,8 +602,6 @@ struct vcpu_vmx {
 		int           gs_ldt_reload_needed;
 		int           fs_reload_needed;
 		u64           msr_host_bndcfgs;
-		unsigned long vmcs_host_cr3;	/* May not match real cr3 */
-		unsigned long vmcs_host_cr4;	/* May not match real cr4 */
 	} host_state;
 	struct {
 		int vm86_active;
@@ -5178,12 +5178,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 */
 	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
-	vmx->host_state.vmcs_host_cr3 = cr3;
+	vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 
 	/* Save the most likely value for this task's CR4 in the VMCS. */
 	cr4 = cr4_read_shadow();
 	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
-	vmx->host_state.vmcs_host_cr4 = cr4;
+	vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
@@ -9274,15 +9274,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
 	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+	if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
 		vmcs_writel(HOST_CR3, cr3);
-		vmx->host_state.vmcs_host_cr3 = cr3;
+		vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 	}
 
 	cr4 = cr4_read_shadow();
-	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+	if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
 		vmcs_writel(HOST_CR4, cr4);
-		vmx->host_state.vmcs_host_cr4 = cr4;
+		vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 	}
 
 	/* When single-stepping over STI and MOV SS, we must clear the
-- 
cgit 


From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Mon, 4 Sep 2017 10:32:15 +0200
Subject: x86/mm: Fix fault error path using unsafe vma pointer

commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
generation code") passes down a vma pointer to the error path, but that is
done once the mmap_sem is released when calling mm_fault_error() from
__do_page_fault().

This is dangerous as the vma structure is no more safe to be used once the
mmap_sem has been released. As only the protection key value is required in
the error processing, we could just pass down this value.

Fix it by passing a pointer to a protection key value down to the fault
signal generation code. The use of a pointer allows to keep the check
generating a warning message in fill_sig_info_pkey() when the vma was not
known. If the pointer is valid, the protection value can be accessed by
deferencing the pointer.

[ tglx: Made *pkey u32 as that's the type which is passed in siginfo ]

Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code")
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com
---
 arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 39567b5c33da..e2baeaa053a5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
  *	     faulted on a pte with its pkey=4.
  */
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
-		struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 {
 	/* This is effectively an #ifdef */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
-	if (!vma) {
+	if (!pkey) {
 		WARN_ONCE(1, "PKU fault with no VMA passed in");
 		info->si_pkey = 0;
 		return;
@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
 	 * absolutely guranteed to be 100% accurate because of
 	 * the race explained above.
 	 */
-	info->si_pkey = vma_pkey(vma);
+	info->si_pkey = *pkey;
 }
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk, struct vm_area_struct *vma,
-		     int fault)
+		     struct task_struct *tsk, u32 *pkey, int fault)
 {
 	unsigned lsb = 0;
 	siginfo_t info;
@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 		lsb = PAGE_SHIFT;
 	info.si_addr_lsb = lsb;
 
-	fill_sig_info_pkey(si_code, &info, vma);
+	fill_sig_info_pkey(si_code, &info, pkey);
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -762,8 +760,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 	unsigned long flags;
 	int sig;
-	/* No context means no VMA to pass down */
-	struct vm_area_struct *vma = NULL;
 
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -788,7 +784,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
 			/* XXX: hwpoison faults will set the wrong code. */
 			force_sig_info_fault(signal, si_code, address,
-					     tsk, vma, 0);
+					     tsk, NULL, 0);
 		}
 
 		/*
@@ -896,8 +892,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		       unsigned long address, struct vm_area_struct *vma,
-		       int si_code)
+		       unsigned long address, u32 *pkey, int si_code)
 {
 	struct task_struct *tsk = current;
 
@@ -945,7 +940,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		tsk->thread.error_code	= error_code;
 		tsk->thread.trap_nr	= X86_TRAP_PF;
 
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 
 		return;
 	}
@@ -958,9 +953,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-		     unsigned long address, struct vm_area_struct *vma)
+		     unsigned long address, u32 *pkey)
 {
-	__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+	__bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
 }
 
 static void
@@ -968,6 +963,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	   unsigned long address,  struct vm_area_struct *vma, int si_code)
 {
 	struct mm_struct *mm = current->mm;
+	u32 pkey;
+
+	if (vma)
+		pkey = vma_pkey(vma);
 
 	/*
 	 * Something tried to access memory that isn't in our memory map..
@@ -975,7 +974,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	 */
 	up_read(&mm->mmap_sem);
 
-	__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
+	__bad_area_nosemaphore(regs, error_code, address,
+			       (vma) ? &pkey : NULL, si_code);
 }
 
 static noinline void
@@ -1018,7 +1018,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
-	  struct vm_area_struct *vma, unsigned int fault)
+	  u32 *pkey, unsigned int fault)
 {
 	struct task_struct *tsk = current;
 	int code = BUS_ADRERR;
@@ -1045,13 +1045,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
+	force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
 }
 
 static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-	       unsigned long address, struct vm_area_struct *vma,
-	       unsigned int fault)
+	       unsigned long address, u32 *pkey, unsigned int fault)
 {
 	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
@@ -1075,9 +1074,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 			     VM_FAULT_HWPOISON_LARGE))
-			do_sigbus(regs, error_code, address, vma, fault);
+			do_sigbus(regs, error_code, address, pkey, fault);
 		else if (fault & VM_FAULT_SIGSEGV)
-			bad_area_nosemaphore(regs, error_code, address, vma);
+			bad_area_nosemaphore(regs, error_code, address, pkey);
 		else
 			BUG();
 	}
@@ -1267,6 +1266,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct mm_struct *mm;
 	int fault, major = 0;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1467,9 +1467,10 @@ good_area:
 		return;
 	}
 
+	pkey = vma_pkey(vma);
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, vma, fault);
+		mm_fault_error(regs, error_code, address, &pkey, fault);
 		return;
 	}
 
-- 
cgit 


From 7d7099433d9eaaa5a989a55f1fa354c16a3ad297 Mon Sep 17 00:00:00 2001
From: Sean Fu <fxinrong@gmail.com>
Date: Mon, 11 Sep 2017 08:33:21 +0800
Subject: x86/sysfs: Fix off-by-one error in loop termination

An off-by-one error in loop terminantion conditions in
create_setup_data_nodes() will lead to memory leak when
create_setup_data_node() failed.

Signed-off-by: Sean Fu <fxinrong@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1505090001-1157-1-git-send-email-fxinrong@gmail.com
---
 arch/x86/kernel/ksysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4b0592ca9e47..8c1cc08f514f 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent)
 	return 0;
 
 out_clean_nodes:
-	for (j = i - 1; j > 0; j--)
+	for (j = i - 1; j >= 0; j--)
 		cleanup_setup_data_node(*(kobjp + j));
 	kfree(kobjp);
 out_setup_data_kobj:
-- 
cgit 


From 5ac751d9e6b187c4a0000879d6598eb2292db949 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Tue, 12 Sep 2017 19:40:00 +0300
Subject: x86: Don't cast away the __user in __get_user_asm_u64()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Don't cast away the __user in __get_user_asm_u64() on x86-32.
Prevents sparse getting upset.

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20170912164000.13745-1-ville.syrjala@linux.intel.com
---
 arch/x86/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 78e8fcc87d4c..4b892917edeb 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -337,7 +337,7 @@ do {									\
 		     _ASM_EXTABLE(1b, 4b)				\
 		     _ASM_EXTABLE(2b, 4b)				\
 		     : "=r" (retval), "=&A"(x)				\
-		     : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1),	\
+		     : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1),	\
 		       "i" (errret), "0" (retval));			\
 })
 
-- 
cgit 


From b09c146f8f63c0e03adba74df76bf9c2be466fec Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Fri, 8 Sep 2017 17:34:47 -0400
Subject: perf/x86/intel/cstate: Add missing CPU IDs

Skylake server uses the same C-state residency events as Sandy Bridge.

Denverton and Gemini lake use the same C-state residency events as
Apollo Lake.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: piotr.luc@intel.com
Cc: harry.pan@intel.com
Cc: srinivas.pandruvada@linux.intel.com
Link: http://lkml.kernel.org/r/20170908213449.6224-1-kan.liang@intel.com
---
 arch/x86/events/intel/cstate.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 4cf100ff2a37..72db0664a53d 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -552,6 +552,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 
 	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE,  snb_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates),
 
 	X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE,  snb_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates),
@@ -560,6 +561,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 	X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates),
 
 	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates),
+
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
-- 
cgit 


From 1aaccc40a1864053da26605b0297be16dd52641e Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Fri, 8 Sep 2017 17:34:48 -0400
Subject: perf/x86/msr: Add missing CPU IDs

Goldmont, Glodmont plus and Xeon Phi have MSR_SMI_COUNT as well.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: piotr.luc@intel.com
Cc: harry.pan@intel.com
Cc: srinivas.pandruvada@linux.intel.com
Link: http://lkml.kernel.org/r/20170908213449.6224-2-kan.liang@intel.com
---
 arch/x86/events/msr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 4bb3ec69e8ea..06723671ae4e 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -63,6 +63,14 @@ static bool test_intel(int idx)
 	case INTEL_FAM6_ATOM_SILVERMONT1:
 	case INTEL_FAM6_ATOM_SILVERMONT2:
 	case INTEL_FAM6_ATOM_AIRMONT:
+
+	case INTEL_FAM6_ATOM_GOLDMONT:
+	case INTEL_FAM6_ATOM_DENVERTON:
+
+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
+
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
 		if (idx == PERF_MSR_SMI)
 			return true;
 		break;
-- 
cgit 


From 450a97893559354b927c935f39ee11126f01f520 Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Fri, 8 Sep 2017 17:34:49 -0400
Subject: perf/x86/intel/rapl: Add missing CPU IDs

DENVERTON and GEMINI_LAKE support same RAPL counters as Apollo Lake.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: piotr.luc@intel.com
Cc: harry.pan@intel.com
Cc: srinivas.pandruvada@linux.intel.com
Link: http://lkml.kernel.org/r/20170908213449.6224-3-kan.liang@intel.com
---
 arch/x86/events/intel/rapl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 8e2457cb6b4a..005908ee9333 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init),
 
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init),
+
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init),
 	{},
 };
 
-- 
cgit 


From 29b46dfb136cdbeece542b3f01115237e43f2855 Mon Sep 17 00:00:00 2001
From: Kan Liang <Kan.liang@intel.com>
Date: Mon, 11 Sep 2017 10:10:15 -0700
Subject: perf/x86/intel/uncore: Correct num_boxes for IIO and IRP

There are 6 IIO/IRP boxes for CBDMA, PCIe0-2, MCP 0 and MCP 1
separately. Correct the num_boxes.

Signed-off-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ak@linux.intel.com
Cc: peterz@infradead.org
Cc: eranian@google.com
Cc: acme@kernel.org
Link: http://lkml.kernel.org/r/1505149816-12580-1-git-send-email-kan.liang@intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index db1fe377e6dd..a7196818416a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3462,7 +3462,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
 static struct intel_uncore_type skx_uncore_iio = {
 	.name			= "iio",
 	.num_counters		= 4,
-	.num_boxes		= 5,
+	.num_boxes		= 6,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= SKX_IIO0_MSR_PMON_CTL0,
 	.perf_ctr		= SKX_IIO0_MSR_PMON_CTR0,
@@ -3492,7 +3492,7 @@ static const struct attribute_group skx_uncore_format_group = {
 static struct intel_uncore_type skx_uncore_irp = {
 	.name			= "irp",
 	.num_counters		= 2,
-	.num_boxes		= 5,
+	.num_boxes		= 6,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= SKX_IRP0_MSR_PMON_CTL0,
 	.perf_ctr		= SKX_IRP0_MSR_PMON_CTR0,
-- 
cgit 


From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Jun 2017 12:57:04 +0200
Subject: KVM: VMX: extract __pi_post_block
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simple code movement patch, preparing for the next one.

Cc: Huangweidong <weidong.huang@huawei.com>
Cc: Gonglei <arei.gonglei@huawei.com>
Cc: wangxin <wangxinxin.wang@huawei.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c83d28b0ab05..0002b14307ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11705,6 +11705,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+	struct pi_desc old, new;
+	unsigned int dest;
+	unsigned long flags;
+
+	do {
+		old.control = new.control = pi_desc->control;
+
+		dest = cpu_physical_id(vcpu->cpu);
+
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
+
+		/* Allow posting non-urgent interrupts */
+		new.sn = 0;
+
+		/* set 'NV' to 'notification vector' */
+		new.nv = POSTED_INTR_VECTOR;
+	} while (cmpxchg(&pi_desc->control, old.control,
+			new.control) != old.control);
+
+	if(vcpu->pre_pcpu != -1) {
+		spin_lock_irqsave(
+			&per_cpu(blocked_vcpu_on_cpu_lock,
+			vcpu->pre_pcpu), flags);
+		list_del(&vcpu->blocked_vcpu_list);
+		spin_unlock_irqrestore(
+			&per_cpu(blocked_vcpu_on_cpu_lock,
+			vcpu->pre_pcpu), flags);
+		vcpu->pre_pcpu = -1;
+	}
+}
+
 /*
  * This routine does the following things for vCPU which is going
  * to be blocked if VT-d PI is enabled.
@@ -11798,44 +11835,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 
 static void pi_post_block(struct kvm_vcpu *vcpu)
 {
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-	struct pi_desc old, new;
-	unsigned int dest;
-	unsigned long flags;
-
 	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
 		!kvm_vcpu_apicv_active(vcpu))
 		return;
 
-	do {
-		old.control = new.control = pi_desc->control;
-
-		dest = cpu_physical_id(vcpu->cpu);
-
-		if (x2apic_enabled())
-			new.ndst = dest;
-		else
-			new.ndst = (dest << 8) & 0xFF00;
-
-		/* Allow posting non-urgent interrupts */
-		new.sn = 0;
-
-		/* set 'NV' to 'notification vector' */
-		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
-
-	if(vcpu->pre_pcpu != -1) {
-		spin_lock_irqsave(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
-		list_del(&vcpu->blocked_vcpu_list);
-		spin_unlock_irqrestore(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
-		vcpu->pre_pcpu = -1;
-	}
+	__pi_post_block(vcpu);
 }
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
-- 
cgit 


From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Jun 2017 12:57:05 +0200
Subject: KVM: VMX: avoid double list add with VT-d posted interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases, for example involving hot-unplug of assigned
devices, pi_post_block can forget to remove the vCPU from the
blocked_vcpu_list.  When this happens, the next call to
pi_pre_block corrupts the list.

Fix this in two ways.  First, check vcpu->pre_pcpu in pi_pre_block
and WARN instead of adding the element twice in the list.  Second,
always do the list removal in pi_post_block if vcpu->pre_pcpu is
set (not -1).

The new code keeps interrupts disabled for the whole duration of
pi_pre_block/pi_post_block.  This is not strictly necessary, but
easier to follow.  For the same reason, PI.ON is checked only
after the cmpxchg, and to handle it we just call the post-block
code.  This removes duplication of the list removal code.

Cc: Huangweidong <weidong.huang@huawei.com>
Cc: Gonglei <arei.gonglei@huawei.com>
Cc: wangxin <wangxinxin.wang@huawei.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0002b14307ab..0bfe97e50a40 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11710,10 +11710,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct pi_desc old, new;
 	unsigned int dest;
-	unsigned long flags;
 
 	do {
 		old.control = new.control = pi_desc->control;
+		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+		     "Wakeup handler not enabled while the VCPU is blocked\n");
 
 		dest = cpu_physical_id(vcpu->cpu);
 
@@ -11730,14 +11731,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
 
-	if(vcpu->pre_pcpu != -1) {
-		spin_lock_irqsave(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
+	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 		list_del(&vcpu->blocked_vcpu_list);
-		spin_unlock_irqrestore(
-			&per_cpu(blocked_vcpu_on_cpu_lock,
-			vcpu->pre_pcpu), flags);
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
 		vcpu->pre_pcpu = -1;
 	}
 }
@@ -11757,7 +11754,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
  */
 static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
-	unsigned long flags;
 	unsigned int dest;
 	struct pi_desc old, new;
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11767,34 +11763,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 		!kvm_vcpu_apicv_active(vcpu))
 		return 0;
 
-	vcpu->pre_pcpu = vcpu->cpu;
-	spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-			  vcpu->pre_pcpu), flags);
-	list_add_tail(&vcpu->blocked_vcpu_list,
-		      &per_cpu(blocked_vcpu_on_cpu,
-		      vcpu->pre_pcpu));
-	spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
-			       vcpu->pre_pcpu), flags);
+	WARN_ON(irqs_disabled());
+	local_irq_disable();
+	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+		vcpu->pre_pcpu = vcpu->cpu;
+		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+		list_add_tail(&vcpu->blocked_vcpu_list,
+			      &per_cpu(blocked_vcpu_on_cpu,
+				       vcpu->pre_pcpu));
+		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+	}
 
 	do {
 		old.control = new.control = pi_desc->control;
 
-		/*
-		 * We should not block the vCPU if
-		 * an interrupt is posted for it.
-		 */
-		if (pi_test_on(pi_desc) == 1) {
-			spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-					  vcpu->pre_pcpu), flags);
-			list_del(&vcpu->blocked_vcpu_list);
-			spin_unlock_irqrestore(
-					&per_cpu(blocked_vcpu_on_cpu_lock,
-					vcpu->pre_pcpu), flags);
-			vcpu->pre_pcpu = -1;
-
-			return 1;
-		}
-
 		WARN((pi_desc->sn == 1),
 		     "Warning: SN field of posted-interrupts "
 		     "is set before blocking\n");
@@ -11819,7 +11801,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
 
-	return 0;
+	/* We should not block the vCPU if an interrupt is posted for it.  */
+	if (pi_test_on(pi_desc) == 1)
+		__pi_post_block(vcpu);
+
+	local_irq_enable();
+	return (vcpu->pre_pcpu == -1);
 }
 
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -11835,12 +11822,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 
 static void pi_post_block(struct kvm_vcpu *vcpu)
 {
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
+	if (vcpu->pre_pcpu == -1)
 		return;
 
+	WARN_ON(irqs_disabled());
+	local_irq_disable();
 	__pi_post_block(vcpu);
+	local_irq_enable();
 }
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
-- 
cgit 


From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Jun 2017 12:57:06 +0200
Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The simplify part: do not touch pi_desc.nv, we can set it when the
VCPU is first created.  Likewise, pi_desc.sn is only handled by
vmx_vcpu_pi_load, do not touch it in __pi_post_block.

The fix part: do not check kvm_arch_has_assigned_device, instead
check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
This matches what the previous patch did in pi_post_block.

Cc: Huangweidong <weidong.huang@huawei.com>
Cc: Gonglei <arei.gonglei@huawei.com>
Cc: wangxin <wangxinxin.wang@huawei.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0bfe97e50a40..b9d2140eb212 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2202,43 +2202,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	struct pi_desc old, new;
 	unsigned int dest;
 
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
+	/*
+	 * In case of hot-plug or hot-unplug, we may have to undo
+	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
+	 * always keep PI.NDST up to date for simplicity: it makes the
+	 * code easier, and CPU migration is not a fast path.
+	 */
+	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+		return;
+
+	/*
+	 * First handle the simple case where no cmpxchg is necessary; just
+	 * allow posting non-urgent interrupts.
+	 *
+	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+	 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+	 * expects the VCPU to be on the blocked_vcpu_list that matches
+	 * PI.NDST.
+	 */
+	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+	    vcpu->cpu == cpu) {
+		pi_clear_sn(pi_desc);
 		return;
+	}
 
+	/* The full case.  */
 	do {
 		old.control = new.control = pi_desc->control;
 
-		/*
-		 * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
-		 * are two possible cases:
-		 * 1. After running 'pre_block', context switch
-		 *    happened. For this case, 'sn' was set in
-		 *    vmx_vcpu_put(), so we need to clear it here.
-		 * 2. After running 'pre_block', we were blocked,
-		 *    and woken up by some other guy. For this case,
-		 *    we don't need to do anything, 'pi_post_block'
-		 *    will do everything for us. However, we cannot
-		 *    check whether it is case #1 or case #2 here
-		 *    (maybe, not needed), so we also clear sn here,
-		 *    I think it is not a big deal.
-		 */
-		if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
-			if (vcpu->cpu != cpu) {
-				dest = cpu_physical_id(cpu);
-
-				if (x2apic_enabled())
-					new.ndst = dest;
-				else
-					new.ndst = (dest << 8) & 0xFF00;
-			}
+		dest = cpu_physical_id(cpu);
 
-			/* set 'NV' to 'notification vector' */
-			new.nv = POSTED_INTR_VECTOR;
-		}
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
 
-		/* Allow posting non-urgent interrupts */
 		new.sn = 0;
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
@@ -9592,6 +9590,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
+	/*
+	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+	 * or POSTED_INTR_WAKEUP_VECTOR.
+	 */
+	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+	vmx->pi_desc.sn = 1;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -11723,9 +11728,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 		else
 			new.ndst = (dest << 8) & 0xFF00;
 
-		/* Allow posting non-urgent interrupts */
-		new.sn = 0;
-
 		/* set 'NV' to 'notification vector' */
 		new.nv = POSTED_INTR_VECTOR;
 	} while (cmpxchg(&pi_desc->control, old.control,
-- 
cgit 


From 0d805ee70a69eabd38160dc199e183ac2f13fe4b Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Wed, 27 Sep 2017 02:41:25 -0700
Subject: xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables
 mapping

When bootup a PVM guest with large memory(Ex.240GB), XEN provided initial
mapping overlaps with kernel module virtual space. When mapping in this space
is cleared by xen_cleanhighmap(), in certain case there could be an 2MB mapping
left. This is due to XEN initialize 4MB aligned mapping but xen_cleanhighmap()
finish at 2MB boundary.

When module loading is just on top of the 2MB space, got below warning:

WARNING: at mm/vmalloc.c:106 vmap_pte_range+0x14e/0x190()
Call Trace:
 [<ffffffff81117083>] warn_alloc_failed+0xf3/0x160
 [<ffffffff81146022>] __vmalloc_area_node+0x182/0x1c0
 [<ffffffff810ac91e>] ? module_alloc_update_bounds+0x1e/0x80
 [<ffffffff81145df7>] __vmalloc_node_range+0xa7/0x110
 [<ffffffff810ac91e>] ? module_alloc_update_bounds+0x1e/0x80
 [<ffffffff8103ca54>] module_alloc+0x64/0x70
 [<ffffffff810ac91e>] ? module_alloc_update_bounds+0x1e/0x80
 [<ffffffff810ac91e>] module_alloc_update_bounds+0x1e/0x80
 [<ffffffff810ac9a7>] move_module+0x27/0x150
 [<ffffffff810aefa0>] layout_and_allocate+0x120/0x1b0
 [<ffffffff810af0a8>] load_module+0x78/0x640
 [<ffffffff811ff90b>] ? security_file_permission+0x8b/0x90
 [<ffffffff810af6d2>] sys_init_module+0x62/0x1e0
 [<ffffffff815154c2>] system_call_fastpath+0x16/0x1b

Then the mapping of 2MB is cleared, finally oops when the page in that space is
accessed.

BUG: unable to handle kernel paging request at ffff880022600000
IP: [<ffffffff81260877>] clear_page_c_e+0x7/0x10
PGD 1788067 PUD 178c067 PMD 22434067 PTE 0
Oops: 0002 [#1] SMP
Call Trace:
 [<ffffffff81116ef7>] ? prep_new_page+0x127/0x1c0
 [<ffffffff81117d42>] get_page_from_freelist+0x1e2/0x550
 [<ffffffff81133010>] ? ii_iovec_copy_to_user+0x90/0x140
 [<ffffffff81119c9d>] __alloc_pages_nodemask+0x12d/0x230
 [<ffffffff81155516>] alloc_pages_vma+0xc6/0x1a0
 [<ffffffff81006ffd>] ? pte_mfn_to_pfn+0x7d/0x100
 [<ffffffff81134cfb>] do_anonymous_page+0x16b/0x350
 [<ffffffff81139c34>] handle_pte_fault+0x1e4/0x200
 [<ffffffff8100712e>] ? xen_pmd_val+0xe/0x10
 [<ffffffff810052c9>] ? __raw_callee_save_xen_pmd_val+0x11/0x1e
 [<ffffffff81139dab>] handle_mm_fault+0x15b/0x270
 [<ffffffff81510c10>] do_page_fault+0x140/0x470
 [<ffffffff8150d7d5>] page_fault+0x25/0x30

Call xen_cleanhighmap() with 4MB aligned for page tables mapping to fix it.
The unnecessory call of xen_cleanhighmap() in DEBUG mode is also removed.

-v2: add comment about XEN alignment from Juergen.

References: https://lists.xen.org/archives/html/xen-devel/2012-07/msg01562.html
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Reviewed-by: Juergen Gross <jgross@suse.com>

[boris: added 'xen/mmu' tag to commit subject]
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/mmu_pv.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 509f560bd0c6..58b09fcadbaa 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1238,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void)
 	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
 	 * the ramdisk). We continue on, erasing PMD entries that point to page
 	 * tables - do note that they are accessible at this stage via __va.
-	 * For good measure we also round up to the PMD - which means that if
+	 * As Xen is aligning the memory end to a 4MB boundary, for good
+	 * measure we also round up to PMD_SIZE * 2 - which means that if
 	 * anybody is using __ka address to the initial boot-stack - and try
 	 * to use it - they are going to crash. The xen_start_info has been
 	 * taken care of already in xen_setup_kernel_pagetable. */
 	addr = xen_start_info->pt_base;
-	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
+	size = xen_start_info->nr_pt_frames * PAGE_SIZE;
 
-	xen_cleanhighmap(addr, addr + size);
+	xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2));
 	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
-#ifdef DEBUG
-	/* This is superfluous and is not necessary, but you know what
-	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
-	 * anything at this stage. */
-	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
-#endif
 }
 #endif
 
-- 
cgit 


From c0a1666bcb2a33e84187a15eabdcd54056be9a97 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 28 Sep 2017 17:58:41 +0200
Subject: KVM: VMX: use cmpxchg64

This fixes a compilation failure on 32-bit systems.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b9d2140eb212..7f62c94196d1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2238,8 +2238,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 			new.ndst = (dest << 8) & 0xFF00;
 
 		new.sn = 0;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 }
 
 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
@@ -11730,8 +11730,8 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 
 		/* set 'NV' to 'notification vector' */
 		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 
 	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
 		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
@@ -11800,8 +11800,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 
 		/* set 'NV' to 'wakeup vector' */
 		new.nv = POSTED_INTR_WAKEUP_VECTOR;
-	} while (cmpxchg(&pi_desc->control, old.control,
-			new.control) != old.control);
+	} while (cmpxchg64(&pi_desc->control, old.control,
+			   new.control) != old.control);
 
 	/* We should not block the vCPU if an interrupt is posted for it.  */
 	if (pi_test_on(pi_desc) == 1)
-- 
cgit 


From 520a13c530aeb5f63e011d668c42db1af19ed349 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 28 Sep 2017 16:58:26 -0500
Subject: x86/asm: Fix inline asm call constraints for GCC 4.4

The kernel test bot (run by Xiaolong Ye) reported that the following commit:

  f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")

is causing double faults in a kernel compiled with GCC 4.4.

Linus subsequently diagnosed the crash pattern and the buggy commit and found that
the issue is with this code:

  register unsigned int __asm_call_sp asm("esp");
  #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)

Even on a 64-bit kernel, it's using ESP instead of RSP.  That causes GCC
to produce the following bogus code:

  ffffffff8147461d:       89 e0                   mov    %esp,%eax
  ffffffff8147461f:       4c 89 f7                mov    %r14,%rdi
  ffffffff81474622:       4c 89 fe                mov    %r15,%rsi
  ffffffff81474625:       ba 20 00 00 00          mov    $0x20,%edx
  ffffffff8147462a:       89 c4                   mov    %eax,%esp
  ffffffff8147462c:       e8 bf 52 05 00          callq  ffffffff814c98f0 <copy_user_generic_unrolled>

Despite the absurdity of it backing up and restoring the stack pointer
for no reason, the bug is actually the fact that it's only backing up
and restoring the lower 32 bits of the stack pointer.  The upper 32 bits
are getting cleared out, corrupting the stack pointer.

So change the '__asm_call_sp' register variable to be associated with
the actual full-size stack pointer.

This also requires changing the __ASM_SEL() macro to be based on the
actual compiled arch size, rather than the CONFIG value, because
CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso).
Otherwise Clang fails to build the kernel because it complains about the
use of a 64-bit register (RSP) in a 32-bit file.

Reported-and-Bisected-and-Tested-by: kernel test robot <xiaolong.ye@intel.com>
Diagnosed-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Miguel Bernal Marin <miguel.bernal.marin@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")
Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/asm.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index c1eadbaf1115..30c3c9ac784a 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -11,10 +11,12 @@
 # define __ASM_FORM_COMMA(x) " " #x ","
 #endif
 
-#ifdef CONFIG_X86_32
+#ifndef __x86_64__
+/* 32 bit */
 # define __ASM_SEL(a,b) __ASM_FORM(a)
 # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
 #else
+/* 64 bit */
 # define __ASM_SEL(a,b) __ASM_FORM(b)
 # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
 #endif
@@ -139,7 +141,7 @@
  * gets set up by the containing function.  If you forget to do this, objtool
  * may print a "call without frame pointer save/setup" warning.
  */
-register unsigned int __asm_call_sp asm("esp");
+register unsigned long __asm_call_sp asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
 #endif
 
-- 
cgit 


From 305d0ab4764d36a02c8e7cddb67099aca65351ce Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 28 Sep 2017 18:16:44 -0700
Subject: KVM: nVMX: Fix nested #PF intends to break L1's vmlauch/vmresume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

------------[ cut here ]------------
 WARNING: CPU: 4 PID: 5280 at /home/kernel/linux/arch/x86/kvm//vmx.c:11394 nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel]
 CPU: 4 PID: 5280 Comm: qemu-system-x86 Tainted: G        W  OE   4.13.0+ #17
 RIP: 0010:nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel]
 Call Trace:
  ? emulator_read_emulated+0x15/0x20 [kvm]
  ? segmented_read+0xae/0xf0 [kvm]
  vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel]
  ? vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel]
  x86_emulate_instruction+0x733/0x810 [kvm]
  vmx_handle_exit+0x2f4/0xda0 [kvm_intel]
  ? kvm_arch_vcpu_ioctl_run+0xd2f/0x1c60 [kvm]
  kvm_arch_vcpu_ioctl_run+0xdab/0x1c60 [kvm]
  ? kvm_arch_vcpu_load+0x62/0x230 [kvm]
  kvm_vcpu_ioctl+0x340/0x700 [kvm]
  ? kvm_vcpu_ioctl+0x340/0x700 [kvm]
  ? __fget+0xfc/0x210
  do_vfs_ioctl+0xa4/0x6a0
  ? __fget+0x11d/0x210
  SyS_ioctl+0x79/0x90
  entry_SYSCALL_64_fastpath+0x23/0xc2

A nested #PF is triggered during L0 emulating instruction for L2. However, it
doesn't consider we should not break L1's vmlauch/vmresme. This patch fixes
it by queuing the #PF exception instead ,requesting an immediate VM exit from
L2 and keeping the exception for L1 pending for a subsequent nested VM exit.

This should actually work all the time, making vmx_inject_page_fault_nested
totally unnecessary.  However, that's not working yet, so this patch can work
around the issue in the meanwhile.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7f62c94196d1..5bfa353f6354 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9845,7 +9845,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	WARN_ON(!is_guest_mode(vcpu));
 
-	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+		!to_vmx(vcpu)->nested.nested_run_pending) {
 		vmcs12->vm_exit_intr_error_code = fault->error_code;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-- 
cgit 


From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Fri, 29 Sep 2017 19:01:45 +0800
Subject: kvm/x86: Handle async PF in RCU read-side critical sections

Sasha Levin reported a WARNING:

| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
| rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
| rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
...
| CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
| 1.10.1-1ubuntu1 04/01/2014
| Call Trace:
...
| RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
| RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
| RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002
| RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000
| RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410
| RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001
| R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08
| R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040
| __schedule+0x201/0x2240 kernel/sched/core.c:3292
| schedule+0x113/0x460 kernel/sched/core.c:3421
| kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158
| do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271
| async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069
| RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996
| RSP: 0018:ffff88003b2df520 EFLAGS: 00010283
| RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670
| RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140
| RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000
| R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8
| R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000
| vsnprintf+0x173/0x1700 lib/vsprintf.c:2136
| sprintf+0xbe/0xf0 lib/vsprintf.c:2386
| proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23
| get_link fs/namei.c:1047 [inline]
| link_path_walk+0x1041/0x1490 fs/namei.c:2127
...

This happened when the host hit a page fault, and delivered it as in an
async page fault, while the guest was in an RCU read-side critical
section.  The guest then tries to reschedule in kvm_async_pf_task_wait(),
but rcu_preempt_note_context_switch() would treat the reschedule as a
sleep in RCU read-side critical section, which is not allowed (even in
preemptible RCU).  Thus the WARN.

To cure this, make kvm_async_pf_task_wait() go to the halt path if the
PF happens in a RCU read-side critical section.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kernel/kvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index aa60a08b65b1..e675704fa6f7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = is_idle_task(current) || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1 ||
+		   rcu_preempt_depth();
 	init_swait_queue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	raw_spin_unlock(&b->lock);
-- 
cgit 


From bc829ee36e0ec92383c9d9b88fe08f00d4d592f8 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 29 Sep 2017 11:24:19 -0500
Subject: x86/mm: Disable branch profiling in mem_encrypt.c

Some routines in mem_encrypt.c are called very early in the boot process,
e.g. sme_encrypt_kernel(). When CONFIG_TRACE_BRANCH_PROFILING=y is defined
the resulting branch profiling associated with the check to see if SME is
active results in a kernel crash. Disable branch profiling for
mem_encrypt.c by defining DISABLE_BRANCH_PROFILING before including any
header files.

Reported-by: kernel test robot <lkp@01.org>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170929162419.6016.53390.stgit@tlendack-t1.amdoffice.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/mem_encrypt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 3fcc8e01683b..16c5f37933a2 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -10,6 +10,8 @@
  * published by the Free Software Foundation.
  */
 
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <linux/mm.h>
-- 
cgit 


From 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 29 Sep 2017 17:15:36 +0300
Subject: x86/asm: Use register variable to get stack pointer value

Currently we use current_stack_pointer() function to get the value
of the stack pointer register. Since commit:

  f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang")

... we have a stack register variable declared. It can be used instead of
current_stack_pointer() function which allows to optimize away some
excessive "mov %rsp, %<dst>" instructions:

 -mov    %rsp,%rdx
 -sub    %rdx,%rax
 -cmp    $0x3fff,%rax
 -ja     ffffffff810722fd <ist_begin_non_atomic+0x2d>

 +sub    %rsp,%rax
 +cmp    $0x3fff,%rax
 +ja     ffffffff810722fa <ist_begin_non_atomic+0x2a>

Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer
and use it instead of the removed function.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/asm.h         |  4 ++--
 arch/x86/include/asm/thread_info.h | 11 -----------
 arch/x86/kernel/irq_32.c           |  6 +++---
 arch/x86/kernel/traps.c            |  2 +-
 arch/x86/mm/tlb.c                  |  2 +-
 5 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 30c3c9ac784a..b0dc91f4bedc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -141,8 +141,8 @@
  * gets set up by the containing function.  If you forget to do this, objtool
  * may print a "call without frame pointer save/setup" warning.
  */
-register unsigned long __asm_call_sp asm(_ASM_SP);
-#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
+register unsigned long current_stack_pointer asm(_ASM_SP);
+#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
 
 #endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 5161da1a0fa0..89e7eeb5cec1 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -158,17 +158,6 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-static inline unsigned long current_stack_pointer(void)
-{
-	unsigned long sp;
-#ifdef CONFIG_X86_64
-	asm("mov %%rsp,%0" : "=g" (sp));
-#else
-	asm("mov %%esp,%0" : "=g" (sp));
-#endif
-	return sp;
-}
-
 /*
  * Walks up the stack frames to make sure that the specified object is
  * entirely contained by a single stack frame.
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1f38d9a4d9de..d4eb450144fd 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -64,7 +64,7 @@ static void call_on_stack(void *func, void *stack)
 
 static inline void *current_stack(void)
 {
-	return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
+	return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
 }
 
 static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
@@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
 
 	/* Save the next esp at the bottom of the stack */
 	prev_esp = (u32 *)irqstk;
-	*prev_esp = current_stack_pointer();
+	*prev_esp = current_stack_pointer;
 
 	if (unlikely(overflow))
 		call_on_stack(print_stack_overflow, isp);
@@ -139,7 +139,7 @@ void do_softirq_own_stack(void)
 
 	/* Push the previous esp onto the stack */
 	prev_esp = (u32 *)irqstk;
-	*prev_esp = current_stack_pointer();
+	*prev_esp = current_stack_pointer;
 
 	call_on_stack(__do_softirq, isp);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 34ea3651362e..67db4f43309e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -142,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * from double_fault.
 	 */
 	BUG_ON((unsigned long)(current_top_of_stack() -
-			       current_stack_pointer()) >= THREAD_SIZE);
+			       current_stack_pointer) >= THREAD_SIZE);
 
 	preempt_enable_no_resched();
 }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 93fe97cce581..49d9778376d7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -191,7 +191,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 * mapped in the new pgd, we'll double-fault.  Forcibly
 			 * map it.
 			 */
-			unsigned int index = pgd_index(current_stack_pointer());
+			unsigned int index = pgd_index(current_stack_pointer);
 			pgd_t *pgd = next->pgd + index;
 
 			if (unlikely(pgd_none(*pgd)))
-- 
cgit 


From ee213fc72fd67d0988525af501534f4cb924d1e9 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 3 Oct 2017 08:51:43 -0500
Subject: kprobes/x86: Set up frame pointer in kprobe trampoline

Richard Weinberger saw an unwinder warning when running bcc's opensnoop:

  WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008
  unwind stack type:0 next_sp:          (null) mask:0x2 graph_idx:0
  ...
  ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0)
  ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90)
  ...

A lockdep stack trace was initiated from inside a kprobe handler, when
the unwinder noticed a bad frame pointer on the stack.  The bad frame
pointer is related to the fact that the kprobe optprobe trampoline
doesn't save the frame pointer before calling into optimized_callback().

Reported-and-tested-by: Richard Weinberger <richard@sigma-star.at>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/common.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..3fc0f9a794cb 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -3,6 +3,15 @@
 
 /* Kprobes and Optprobes common header */
 
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING "	push %" _ASM_BP "\n" \
+			 "	mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING "	push %" _ASM_BP "\n"
+#endif
+
 #ifdef CONFIG_X86_64
 #define SAVE_REGS_STRING			\
 	/* Skip cs, ip, orig_ax. */		\
@@ -17,7 +26,7 @@
 	"	pushq %r10\n"			\
 	"	pushq %r11\n"			\
 	"	pushq %rbx\n"			\
-	"	pushq %rbp\n"			\
+	SAVE_RBP_STRING				\
 	"	pushq %r12\n"			\
 	"	pushq %r13\n"			\
 	"	pushq %r14\n"			\
@@ -48,7 +57,7 @@
 	"	pushl %es\n"			\
 	"	pushl %ds\n"			\
 	"	pushl %eax\n"			\
-	"	pushl %ebp\n"			\
+	SAVE_RBP_STRING				\
 	"	pushl %edi\n"			\
 	"	pushl %esi\n"			\
 	"	pushl %edx\n"			\
-- 
cgit 


From b664d57f39d01e775204d4f1a7e2f8bda77bc549 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 3 Oct 2017 16:18:02 +0900
Subject: kprobes/x86: Remove IRQ disabling from jprobe handlers

Jprobes actually don't need to disable IRQs while calling
handlers, because of how we specify the kernel interface in
Documentation/kprobes.txt:

-----
 Probe handlers are run with preemption disabled.  Depending on the
 architecture and optimization state, handlers may also run with
 interrupts disabled (e.g., kretprobe handlers and optimized kprobe
 handlers run without interrupt disabled on x86/x86-64).
-----

So let's remove IRQ disabling from jprobes too.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150701508194.32266.14458959863314097305.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f0153714ddac..0742491cbb73 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	 * raw stack chunk with redzones:
 	 */
 	__memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
-	regs->flags &= ~X86_EFLAGS_IF;
-	trace_hardirqs_off();
 	regs->ip = (unsigned long)(jp->entry);
 
 	/*
-- 
cgit 


From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 3 Oct 2017 15:37:20 -0700
Subject: bpf: fix bpf_tail_call() x64 JIT

- bpf prog_array just like all other types of bpf array accepts 32-bit index.
  Clarify that in the comment.
- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
- tighten corresponding check in the interpreter to stay consistent

The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
though JIT code is wrong it will check bounds correctly.
Hence two fixes tags. All other JITs don't have this problem.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c9573660d51..0554e8aef4d5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	/* if (index >= array->map.max_entries)
 	 *   goto out;
 	 */
-	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
+	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
 #define OFFSET1 43 /* number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
-- 
cgit 


From a2b7861bb33b2538420bb5d8554153484d3f961f Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Tue, 3 Oct 2017 21:36:51 +0800
Subject: kvm/x86: Avoid async PF preempting the kernel incorrectly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, in PREEMPT_COUNT=n kernel, kvm_async_pf_task_wait() could call
schedule() to reschedule in some cases.  This could result in
accidentally ending the current RCU read-side critical section early,
causing random memory corruption in the guest, or otherwise preempting
the currently running task inside between preempt_disable and
preempt_enable.

The difficulty to handle this well is because we don't know whether an
async PF delivered in a preemptible section or RCU read-side critical section
for PREEMPT_COUNT=n, since preempt_disable()/enable() and rcu_read_lock/unlock()
are both no-ops in that case.

To cure this, we treat any async PF interrupting a kernel context as one
that cannot be preempted, preventing kvm_async_pf_task_wait() from choosing
the schedule() path in that case.

To do so, a second parameter for kvm_async_pf_task_wait() is introduced,
so that we know whether it's called from a context interrupting the
kernel, and the parameter is set properly in all the callsites.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/kvm_para.h |  4 ++--
 arch/x86/kernel/kvm.c           | 14 ++++++++++----
 arch/x86/kvm/mmu.c              |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index bc62e7cbf1b1..59ad3d132353 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 void __init kvm_guest_init(void);
-void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void)
 
 #else /* CONFIG_KVM_GUEST */
 #define kvm_guest_init() do {} while (0)
-#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wait(T, I) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
 static inline bool kvm_para_available(void)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e675704fa6f7..8bb9594d0761 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
 	return NULL;
 }
 
-void kvm_async_pf_task_wait(u32 token)
+/*
+ * @interrupt_kernel: Is this called from a routine which interrupts the kernel
+ * 		      (other than user space)?
+ */
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
 {
 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = is_idle_task(current) || preempt_count() > 1 ||
-		   rcu_preempt_depth();
+	n.halted = is_idle_task(current) ||
+		   (IS_ENABLED(CONFIG_PREEMPT_COUNT)
+		    ? preempt_count() > 1 || rcu_preempt_depth()
+		    : interrupt_kernel);
 	init_swait_queue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	raw_spin_unlock(&b->lock);
@@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2());
+		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca30c1eb1d9..106d4a029a8a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		vcpu->arch.apf.host_apf_reason = 0;
 		local_irq_disable();
-		kvm_async_pf_task_wait(fault_address);
+		kvm_async_pf_task_wait(fault_address, 0);
 		local_irq_enable();
 		break;
 	case KVM_PV_REASON_PAGE_READY:
-- 
cgit 


From 262e681183ddcdb24d64a2f993e41a226adcec29 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 2 Oct 2017 11:28:36 +0200
Subject: x86/mce: Hide mca_cfg

Now that lguest is gone, put it in the internal header which should be
used only by MCA/RAS code.

Add missing header guards while at it.

No functional change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20171002092836.22971-3-bp@alien8.de
---
 arch/x86/include/asm/mce.h                | 1 -
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 7 +++++++
 arch/x86/kernel/cpu/mcheck/mce_amd.c      | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 181264989db5..8edac1de2e35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -187,7 +187,6 @@ struct mca_msr_regs {
 
 extern struct mce_vendor_flags mce_flags;
 
-extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 
 enum mce_notifier_prios {
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 098530a93bb7..debb974fd17d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,6 @@
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
 #include <linux/device.h>
 #include <asm/mce.h>
 
@@ -108,3 +111,7 @@ static inline void mce_work_trigger(void)	{ }
 static inline void mce_register_injector_chain(struct notifier_block *nb)	{ }
 static inline void mce_unregister_injector_chain(struct notifier_block *nb)	{ }
 #endif
+
+extern struct mca_config mca_cfg;
+
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 40e28ed77fbf..486f640b02ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -28,6 +28,8 @@
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
 
+#include "mce-internal.h"
+
 #define NR_BLOCKS         5
 #define THRESHOLD_MAX     0xFFF
 #define INT_TYPE_APIC     0x00020000
-- 
cgit 


From f26e60167d8b5b1c67b3efd4cb5672da446bdb0e Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 4 Oct 2017 10:39:05 -0500
Subject: x86/kvm: Move kvm_fastop_exception to .fixup section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling the kernel with the '-frecord-gcc-switches' flag, objtool
complains:

  arch/x86/kvm/emulate.o: warning: objtool: .GCC.command.line+0x0: special: can't find new instruction

And also the kernel fails to link.

The problem is that the 'kvm_fastop_exception' code gets placed into the
throwaway '.GCC.command.line' section instead of '.text'.

Exception fixup code is conventionally placed in the '.fixup' section,
so put it there where it belongs.

Reported-and-tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/emulate.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a36254cbf776..d90cdc77e077 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 	#op " %al \n\t" \
 	FOP_RET
 
-asm(".global kvm_fastop_exception \n"
-    "kvm_fastop_exception: xor %esi, %esi; ret");
+asm(".pushsection .fixup, \"ax\"\n"
+    ".global kvm_fastop_exception \n"
+    "kvm_fastop_exception: xor %esi, %esi; ret\n"
+    ".popsection");
 
 FOP_START(setcc)
 FOP_SETCC(seto)
-- 
cgit 


From e42eef4ba38806b18c4a74f0c276fb2e0b548173 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 4 Oct 2017 12:28:18 +0200
Subject: KVM: add X86_LOCAL_APIC dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The rework of the posted interrupt handling broke building without
support for the local APIC:

ERROR: "boot_cpu_physical_apicid" [arch/x86/kvm/kvm-intel.ko] undefined!

That configuration is probably not particularly useful anyway, so
we can avoid the randconfig failures by adding a Kconfig dependency.

Fixes: 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 3ea624452f93..3c48bc8bf08c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM
 	depends on HIGH_RES_TIMERS
 	# for TASKSTATS/TASK_DELAY_ACCT:
 	depends on NET && MULTIUSER
+	depends on X86_LOCAL_APIC
 	select PREEMPT_NOTIFIERS
 	select MMU_NOTIFIER
 	select ANON_INODES
-- 
cgit 


From 924c6b900cfdf376b07bccfd80e62b21914f8a5a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 8 Oct 2017 21:53:05 -0700
Subject: x86/mm/64: Fix reboot interaction with CR4.PCIDE

Trying to reboot via real mode fails with PCID on: long mode cannot
be exited while CR4.PCIDE is set.  (No, I have no idea why, but the
SDM and actual CPUs are in agreement here.)  The result is a GPF and
a hang instead of a reboot.

I didn't catch this in testing because neither my computer nor my VM
reboots this way.  I can trigger it with reboot=bios, though.

Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems")
Reported-and-tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org
---
 arch/x86/kernel/reboot.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 54180fa6f66f..add33f600531 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
 	load_cr3(initial_page_table);
 #else
 	write_cr3(real_mode_header->trampoline_pgd);
+
+	/* Exiting long mode will fail if CR4.PCIDE is set. */
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr4_clear_bits(X86_CR4_PCIDE);
 #endif
 
 	/* Jump to the identity-mapped low memory code */
-- 
cgit 


From 6b32c126d33d5cb379bca280ab8acedc1ca978ff Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Thu, 5 Oct 2017 20:30:12 +0200
Subject: x86/alternatives: Fix alt_max_short macro to really be a max()

The alt_max_short() macro in asm/alternative.h does not work as
intended, leading to nasty bugs. E.g. alt_max_short("1", "3")
evaluates to 3, but alt_max_short("3", "1") evaluates to 1 -- not
exactly the maximum of 1 and 3.

In fact, I had to learn it the hard way by crashing my kernel in not
so funny ways by attempting to make use of the ALTENATIVE_2 macro
with alternatives where the first one was larger than the second
one.

According to [1] and commit dbe4058a6a44 ("x86/alternatives: Fix
ALTERNATIVE_2 padding generation properly") the right handed side
should read "-(-(a < b))" not "-(-(a - b))". Fix that, to make the
macro work as intended.

While at it, fix up the comments regarding the additional "-", too.
It's not about gas' usage of s32 but brain dead logic of having a
"true" value of -1 for the < operator ... *sigh*

Btw., the one in asm/alternative-asm.h is correct. And, apparently,
all current users of ALTERNATIVE_2() pass same sized alternatives,
avoiding to hit the bug.

[1] http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax

Reviewed-and-tested-by: Borislav Petkov <bp@suse.de>
Fixes: dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly")
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1507228213-13095-1-git-send-email-minipli@googlemail.com
---
 arch/x86/include/asm/alternative-asm.h | 4 +++-
 arch/x86/include/asm/alternative.h     | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
 #define new_len2		145f-144f
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c096624137ae..ccbe24e697c4 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	alt_end_marker ":\n"
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
  *
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
-#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
 
 /*
  * Pad the second replacement alternative with additional NOPs if it is
-- 
cgit 


From 62dd86ac01f9fb6386d7f8c6b389c3ea4582a50a Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:02 -0500
Subject: x86/unwind: Fix dereference of untrusted pointer

Tetsuo Handa and Fengguang Wu reported a panic in the unwinder:

  BUG: unable to handle kernel NULL pointer dereference at 000001f2
  IP: update_stack_state+0xd4/0x340
  *pde = 00000000

  Oops: 0000 [#1] PREEMPT SMP
  CPU: 0 PID: 18728 Comm: 01-cpu-hotplug Not tainted 4.13.0-rc4-00170-gb09be67 #592
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014
  task: bb0b53c0 task.stack: bb3ac000
  EIP: update_stack_state+0xd4/0x340
  EFLAGS: 00010002 CPU: 0
  EAX: 0000a570 EBX: bb3adccb ECX: 0000f401 EDX: 0000a570
  ESI: 00000001 EDI: 000001ba EBP: bb3adc6b ESP: bb3adc3f
   DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
  CR0: 80050033 CR2: 000001f2 CR3: 0b3a7000 CR4: 00140690
  DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
  DR6: fffe0ff0 DR7: 00000400
  Call Trace:
   ? unwind_next_frame+0xea/0x400
   ? __unwind_start+0xf5/0x180
   ? __save_stack_trace+0x81/0x160
   ? save_stack_trace+0x20/0x30
   ? __lock_acquire+0xfa5/0x12f0
   ? lock_acquire+0x1c2/0x230
   ? tick_periodic+0x3a/0xf0
   ? _raw_spin_lock+0x42/0x50
   ? tick_periodic+0x3a/0xf0
   ? tick_periodic+0x3a/0xf0
   ? debug_smp_processor_id+0x12/0x20
   ? tick_handle_periodic+0x23/0xc0
   ? local_apic_timer_interrupt+0x63/0x70
   ? smp_trace_apic_timer_interrupt+0x235/0x6a0
   ? trace_apic_timer_interrupt+0x37/0x3c
   ? strrchr+0x23/0x50
  Code: 0f 95 c1 89 c7 89 45 e4 0f b6 c1 89 c6 89 45 dc 8b 04 85 98 cb 74 bc 88 4d e3 89 45 f0 83 c0 01 84 c9 89 04 b5 98 cb 74 bc 74 3b <8b> 47 38 8b 57 34 c6 43 1d 01 25 00 00 02 00 83 e2 03 09 d0 83
  EIP: update_stack_state+0xd4/0x340 SS:ESP: 0068:bb3adc3f
  CR2: 00000000000001f2
  ---[ end trace 0d147fd4aba8ff50 ]---
  Kernel panic - not syncing: Fatal exception in interrupt

On x86-32, after decoding a frame pointer to get a regs address,
regs_size() dereferences the regs pointer when it checks regs->cs to see
if the regs are user mode.  This is dangerous because it's possible that
what looks like a decoded frame pointer is actually a corrupt value, and
we don't want the unwinder to make things worse.

Instead of calling regs_size() on an unsafe pointer, just assume they're
kernel regs to start with.  Later, once it's safe to access the regs, we
can do the user mode check and corresponding safety check for the
remaining two regs.

Reported-and-tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 5ed8d8bb38c5 ("x86/unwind: Move common code into update_stack_state()")
Link: http://lkml.kernel.org/r/7f95b9a6993dec7674b3f3ab3dcd3294f7b9644d.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d145a0b1f529..d05637726c10 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -184,6 +184,12 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 	return (struct pt_regs *)(regs & ~0x1);
 }
 
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
+
 static bool update_stack_state(struct unwind_state *state,
 			       unsigned long *next_bp)
 {
@@ -202,7 +208,7 @@ static bool update_stack_state(struct unwind_state *state,
 	regs = decode_frame_pointer(next_bp);
 	if (regs) {
 		frame = (unsigned long *)regs;
-		len = regs_size(regs);
+		len = KERNEL_REGS_SIZE;
 		state->got_irq = true;
 	} else {
 		frame = next_bp;
@@ -226,6 +232,14 @@ static bool update_stack_state(struct unwind_state *state,
 	    frame < prev_frame_end)
 		return false;
 
+	/*
+	 * On 32-bit with user mode regs, make sure the last two regs are safe
+	 * to access:
+	 */
+	if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+	    !on_stack(info, frame, len + 2*sizeof(long)))
+		return false;
+
 	/* Move state to the next frame: */
 	if (regs) {
 		state->regs = regs;
-- 
cgit 


From 5c99b692cfd62f6915ed7ee7ed3c38d65ba3feab Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:03 -0500
Subject: x86/unwind: Use MSB for frame pointer encoding on 32-bit

On x86-32, Tetsuo Handa and Fengguang Wu reported unwinder warnings
like:

  WARNING: kernel stack regs at f60bb9c8 in swapper:1 has bad 'bp' value 0ba00000

And also there were some stack dumps with a bunch of unreliable '?'
symbols after an apic_timer_interrupt symbol, meaning the unwinder got
confused when it tried to read the regs.

The cause of those issues is that, with GCC 4.8 (and possibly older),
there are cases where GCC misaligns the stack pointer in a leaf function
for no apparent reason:

  c124a388 <acpi_rs_move_data>:
  c124a388:       55                      push   %ebp
  c124a389:       89 e5                   mov    %esp,%ebp
  c124a38b:       57                      push   %edi
  c124a38c:       56                      push   %esi
  c124a38d:       89 d6                   mov    %edx,%esi
  c124a38f:       53                      push   %ebx
  c124a390:       31 db                   xor    %ebx,%ebx
  c124a392:       83 ec 03                sub    $0x3,%esp
  ...
  c124a3e3:       83 c4 03                add    $0x3,%esp
  c124a3e6:       5b                      pop    %ebx
  c124a3e7:       5e                      pop    %esi
  c124a3e8:       5f                      pop    %edi
  c124a3e9:       5d                      pop    %ebp
  c124a3ea:       c3                      ret

If an interrupt occurs in such a function, the regs on the stack will be
unaligned, which breaks the frame pointer encoding assumption.  So on
32-bit, use the MSB instead of the LSB to encode the regs.

This isn't an issue on 64-bit, because interrupts align the stack before
writing to it.

Reported-and-tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/279a26996a482ca716605c7dbc7f2db9d8d91e81.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S      |  4 ++--
 arch/x86/kernel/unwind_frame.c | 12 ++++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 8a13d468635a..50e0d2bc4528 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -176,7 +176,7 @@
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
  * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
  * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
@@ -185,7 +185,7 @@
 .macro ENCODE_FRAME_POINTER
 #ifdef CONFIG_FRAME_POINTER
 	mov %esp, %ebp
-	orl $0x1, %ebp
+	andl $0x7fffffff, %ebp
 #endif
 .endm
 
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d05637726c10..4949bbc95f75 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -174,6 +174,7 @@ static bool is_last_task_frame(struct unwind_state *state)
  * This determines if the frame pointer actually contains an encoded pointer to
  * pt_regs on the stack.  See ENCODE_FRAME_POINTER.
  */
+#ifdef CONFIG_X86_64
 static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 {
 	unsigned long regs = (unsigned long)bp;
@@ -183,6 +184,17 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 
 	return (struct pt_regs *)(regs & ~0x1);
 }
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+	unsigned long regs = (unsigned long)bp;
+
+	if (regs & 0x80000000)
+		return NULL;
+
+	return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
 
 #ifdef CONFIG_X86_32
 #define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
-- 
cgit 


From 99bd28a49b150e4b938313a63b5532d95ba77885 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:04 -0500
Subject: x86/unwind: Align stack pointer in unwinder dump

When printing the unwinder dump, the stack pointer could be unaligned,
for one of two reasons:

- stack corruption; or

- GCC created an unaligned stack.

There's no way for the unwinder to tell the difference between the two,
so we have to assume one or the other.  GCC unaligned stacks are very
rare, and have only been spotted before GCC 5.  Presumably, if we're
doing an unwinder stack dump, stack corruption is more likely than a
GCC unaligned stack.  So always align the stack before starting the
dump.

Reported-and-tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/2f540c515946ab09ed267e1a1d6421202a0cce08.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 4949bbc95f75..81aca077fbb6 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
 			state->stack_info.type, state->stack_info.next_sp,
 			state->stack_mask, state->graph_idx);
 
-	for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+	for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+	     sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
 			break;
 
-- 
cgit 


From d4a2d031dd42f9594107d317e2a9a0c6d73ad46b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 9 Oct 2017 20:20:05 -0500
Subject: x86/unwind: Disable unwinder warnings on 32-bit

x86-32 doesn't have stack validation, so in most cases it doesn't make
sense to warn about bad frame pointers.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a69658760800bf281e6353248c23e0fa0acf5230.1507597785.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 81aca077fbb6..3dc26f95d46e 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -355,6 +355,13 @@ bad_address:
 	    state->regs->sp < (unsigned long)task_pt_regs(state->task))
 		goto the_end;
 
+	/*
+	 * There are some known frame pointer issues on 32-bit.  Disable
+	 * unwinder warnings on 32-bit until it gets objtool support.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32))
+		goto the_end;
+
 	if (state->regs) {
 		printk_deferred_once(KERN_WARNING
 			"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
-- 
cgit 


From 629eb703d3e46aa570c6c91235d38fd09ed8c58b Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 9 Oct 2017 18:26:55 +0100
Subject: perf/x86/intel/uncore: Fix memory leaks on allocation failures

Currently if an allocation fails then the error return paths
don't free up any currently allocated pmus[].boxes and pmus causing
a memory leak.  Add an error clean up exit path that frees these
objects.

Detected by CoverityScan, CID#711632 ("Resource Leak")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-janitors@vger.kernel.org
Fixes: 087bfbb03269 ("perf/x86: Add generic Intel uncore PMU support")
Link: http://lkml.kernel.org/r/20171009172655.6132-1-colin.king@canonical.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/uncore.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 1c5390f1cf09..d45e06346f14 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 		pmus[i].type	= type;
 		pmus[i].boxes	= kzalloc(size, GFP_KERNEL);
 		if (!pmus[i].boxes)
-			return -ENOMEM;
+			goto err;
 	}
 
 	type->pmus = pmus;
@@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 		attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
 					sizeof(*attr_group), GFP_KERNEL);
 		if (!attr_group)
-			return -ENOMEM;
+			goto err;
 
 		attrs = (struct attribute **)(attr_group + 1);
 		attr_group->name = "events";
@@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 	}
 
 	type->pmu_group = &uncore_pmu_attr_group;
+
 	return 0;
+
+err:
+	for (i = 0; i < type->num_boxes; i++)
+		kfree(pmus[i].boxes);
+	kfree(pmus);
+
+	return -ENOMEM;
 }
 
 static int __init
-- 
cgit 


From a3b7424392924e778b608e30ee321f7b10cc94b8 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Fri, 6 Oct 2017 17:48:54 +0200
Subject: x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded
 vCPUs

hv_flush_pcpu_ex structures are not cleared between calls for performance
reasons (they're variable size up to PAGE_SIZE each) but we must clear
hv_vp_set.bank_contents part of it to avoid flushing unneeded vCPUs. The
rest of the structure is formed correctly.

To do the clearing in an efficient way stash the maximum possible vCPU
number (this may differ from Linux CPU id).

Reported-by: Jork Loeser <Jork.Loeser@microsoft.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: devel@linuxdriverproject.org
Link: http://lkml.kernel.org/r/20171006154854.18092-1-vkuznets@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/hyperv/hv_init.c       |  5 +++++
 arch/x86/hyperv/mmu.c           | 17 ++++++++++++-----
 arch/x86/include/asm/mshyperv.h |  1 +
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a8eb550c40f..a5db63f728a2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 u32 *hv_vp_index;
 EXPORT_SYMBOL_GPL(hv_vp_index);
 
+u32 hv_max_vp_index;
+
 static int hv_cpu_init(unsigned int cpu)
 {
 	u64 msr_vp_index;
@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
 
 	hv_vp_index[smp_processor_id()] = msr_vp_index;
 
+	if (msr_vp_index > hv_max_vp_index)
+		hv_max_vp_index = msr_vp_index;
+
 	return 0;
 }
 
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 39e7f6e50919..9502d04c0c95 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 {
 	int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
 
+	/* valid_bank_mask can represent up to 64 banks */
+	if (hv_max_vp_index / 64 >= 64)
+		return 0;
+
+	/*
+	 * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+	 * structs are not cleared between calls, we risk flushing unneeded
+	 * vCPUs otherwise.
+	 */
+	for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+		flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
 	/*
 	 * Some banks may end up being empty but this is acceptable.
 	 */
@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 		vcpu = hv_cpu_number_to_vp_number(cpu);
 		vcpu_bank = vcpu / 64;
 		vcpu_offset = vcpu % 64;
-
-		/* valid_bank_mask can represent up to 64 banks */
-		if (vcpu_bank >= 64)
-			return 0;
-
 		__set_bit(vcpu_offset, (unsigned long *)
 			  &flush->hv_vp_set.bank_contents[vcpu_bank]);
 		if (vcpu_bank >= nr_bank)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 738503e1f80c..530f448fddaf 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
  * to this information.
  */
 extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
 
 /**
  * hv_cpu_number_to_vp_number() - Map CPU to VP.
-- 
cgit 


From 60d73a7c96601434dfdb56d5b9167ff3b850d8d7 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 5 Oct 2017 13:39:24 +0200
Subject: x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex
 structures

hv_do_hypercall() does virt_to_phys() translation and with some configs
(CONFIG_SLAB) this doesn't work for percpu areas, we pass wrong memory to
hypervisor and get #GP. We could use working slow_virt_to_phys() instead
but doing so kills the performance.

Move pcpu_flush/pcpu_flush_ex structures out of percpu areas and
allocate memory on first call. The additional level of indirection gives
us a small performance penalty, in future we may consider introducing
hypercall functions which avoid virt_to_phys() conversion and cache
physical addresses of pcpu_flush/pcpu_flush_ex structures somewhere.

Reported-by: Simon Xiao <sixiao@microsoft.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: devel@linuxdriverproject.org
Link: http://lkml.kernel.org/r/20171005113924.28021-1-vkuznets@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/hyperv/mmu.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 9502d04c0c95..f21cebbb5f6c 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
 /* Each gva in gva_list encodes up to 4096 pages to flush */
 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
 
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
 
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
 
 /*
  * Fills in gva_list starting from offset. Returns the number of items added.
@@ -109,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 				    const struct flush_tlb_info *info)
 {
 	int cpu, vcpu, gva_n, max_gvas;
+	struct hv_flush_pcpu **flush_pcpu;
 	struct hv_flush_pcpu *flush;
 	u64 status = U64_MAX;
 	unsigned long flags;
@@ -123,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush = this_cpu_ptr(pcpu_flush);
+	flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+	if (unlikely(!*flush_pcpu))
+		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto do_native;
+	}
 
 	if (info->mm) {
 		flush->address_space = virt_to_phys(info->mm->pgd);
@@ -180,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 				       const struct flush_tlb_info *info)
 {
 	int nr_bank = 0, max_gvas, gva_n;
+	struct hv_flush_pcpu_ex **flush_pcpu;
 	struct hv_flush_pcpu_ex *flush;
 	u64 status = U64_MAX;
 	unsigned long flags;
@@ -194,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush = this_cpu_ptr(pcpu_flush_ex);
+	flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+	if (unlikely(!*flush_pcpu))
+		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto do_native;
+	}
 
 	if (info->mm) {
 		flush->address_space = virt_to_phys(info->mm->pgd);
@@ -273,7 +295,7 @@ void hyper_alloc_mmu(void)
 		return;
 
 	if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-		pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+		pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
 	else
-		pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+		pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
 }
-- 
cgit 


From ab7ff471aa5db670197070760f022622793da7e5 Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Date: Thu, 5 Oct 2017 10:34:29 -0300
Subject: x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing

Do not consider the fixed size of hv_vp_set when passing the variable
header size to hv_do_rep_hypercall().

The Hyper-V hypervisor specification states that for a hypercall with a
variable header only the size of the variable portion should be supplied
via the input control.

For HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX/LIST_EX calls that means the
fixed portion of hv_vp_set should not be considered.

That fixes random failures of some applications that are unexpectedly
killed with SIGBUS or SIGSEGV.

Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: devel@linuxdriverproject.org
Fixes: 628f54cc6451 ("x86/hyper-v: Support extended CPU ranges for TLB flush hypercalls")
Link: http://lkml.kernel.org/r/1507210469-29065-1-git-send-email-marcelo.cerri@canonical.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/hyperv/mmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index f21cebbb5f6c..9cc9e1c1e2db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -251,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-			0, nr_bank + 2, flush, NULL);
+			0, nr_bank, flush, NULL);
 	} else if (info->end &&
 		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-			0, nr_bank + 2, flush, NULL);
+			0, nr_bank, flush, NULL);
 	} else {
 		gva_n = fill_gva_list(flush->gva_list, nr_bank,
 				      info->start, info->end);
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
-			gva_n, nr_bank + 2, flush, NULL);
+			gva_n, nr_bank, flush, NULL);
 	}
 
 	local_irq_restore(flags);
-- 
cgit 


From eac779aa509d453a55da0ea4302bdb79c4e0854f Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Sun, 8 Oct 2017 19:58:46 -0700
Subject: xen/vcpu: Use a unified name about cpu hotplug state for pv and pvhvm

As xen_cpuhp_setup is called by PV and PVHVM, the name of "x86/xen/hvm_guest"
is confusing.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/enlighten.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0e7ef69e8531..d669e9d89001 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
 	int rc;
 
 	rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
-				       "x86/xen/hvm_guest:prepare",
+				       "x86/xen/guest:prepare",
 				       cpu_up_prepare_cb, cpu_dead_cb);
 	if (rc >= 0) {
 		rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
-					       "x86/xen/hvm_guest:online",
+					       "x86/xen/guest:online",
 					       xen_cpu_up_online, NULL);
 		if (rc < 0)
 			cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
-- 
cgit 


From fd19d3b45164466a4adce7cbff448ba9189e1427 Mon Sep 17 00:00:00 2001
From: Ladi Prosek <lprosek@redhat.com>
Date: Thu, 5 Oct 2017 11:10:22 +0200
Subject: KVM: nVMX: update last_nonleaf_level when initializing nested EPT

The function updates context->root_level but didn't call
update_last_nonleaf_level so the previous and potentially wrong value
was used for page walks.  For example, a zero value of last_nonleaf_level
would allow a potential out-of-bounds access in arch/x86/mmu/paging_tmpl.h's
walk_addr_generic function (CVE-2017-12188).

Fixes: 155a97a3d7c78b46cef6f1a973c831bc5a4f82bb
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 106d4a029a8a..3c25f20115bc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4555,6 +4555,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
+	update_last_nonleaf_level(vcpu, context);
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
-- 
cgit 


From 829ee279aed43faa5cb1e4d65c0cad52f2426c53 Mon Sep 17 00:00:00 2001
From: Ladi Prosek <lprosek@redhat.com>
Date: Thu, 5 Oct 2017 11:10:23 +0200
Subject: KVM: MMU: always terminate page walks at level 1

is_last_gpte() is not equivalent to the pseudo-code given in commit
6bb69c9b69c31 ("KVM: MMU: simplify last_pte_bitmap") because an incorrect
value of last_nonleaf_level may override the result even if level == 1.

It is critical for is_last_gpte() to return true on level == 1 to
terminate page walks. Otherwise memory corruption may occur as level
is used as an index to various data structures throughout the page
walking code.  Even though the actual bug would be wherever the MMU is
initialized (as in the previous patch), be defensive and ensure here
that is_last_gpte() returns the correct value.

This patch is also enough to fix CVE-2017-12188.

Fixes: 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2
Cc: stable@vger.kernel.org
Cc: Andy Honig <ahonig@google.com>
Signed-off-by: Ladi Prosek <lprosek@redhat.com>
[Panic if walk_addr_generic gets an incorrect level; this is a serious
 bug and it's not worth a WARN_ON where the recovery path might hide
 further exploitable issues; suggested by Andrew Honig. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c         | 14 +++++++-------
 arch/x86/kvm/paging_tmpl.h |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3c25f20115bc..7a69cf053711 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3973,13 +3973,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 static inline bool is_last_gpte(struct kvm_mmu *mmu,
 				unsigned level, unsigned gpte)
 {
-	/*
-	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
-	 */
-	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
 	/*
 	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
 	 * If it is clear, there are no large pages at this level, so clear
@@ -3987,6 +3980,13 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 	 */
 	gpte &= level - mmu->last_nonleaf_level;
 
+	/*
+	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+	 */
+	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
 	return gpte & PT_PAGE_SIZE_MASK;
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 86b68dc5a649..f18d1f8d332b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -334,10 +334,11 @@ retry_walk:
 		--walker->level;
 
 		index = PT_INDEX(addr, walker->level);
-
 		table_gfn = gpte_to_gfn(pte);
 		offset    = index * sizeof(pt_element_t);
 		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
+
+		BUG_ON(walker->level < 1);
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-- 
cgit 


From 67bb8e999e0aeac285d22f0e53c856b9df5282c6 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Tue, 10 Oct 2017 14:45:04 -0500
Subject: x86/mm: Disable various instrumentations of mm/mem_encrypt.c and
 mm/tlb.c

Some routines in mem_encrypt.c are called very early in the boot process,
e.g. sme_enable().  When CONFIG_KCOV=y is defined the resulting code added
to sme_enable() (and others) for KCOV instrumentation results in a kernel
crash.  Disable the KCOV instrumentation for mem_encrypt.c by adding
KCOV_INSTRUMENT_mem_encrypt.o := n to arch/x86/mm/Makefile.

In order to avoid other possible early boot issues, model mem_encrypt.c
after head64.c in regards to tools. In addition to disabling KCOV as
stated above and a previous patch that disables branch profiling, also
remove the "-pg" CFLAG if CONFIG_FUNCTION_TRACER is enabled and set
KASAN_SANITIZE to "n", each of which are done on a file basis.

Reported-by: kernel test robot <lkp@01.org>
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171010194504.18887.38053.stgit@tlendack-t1.amdoffice.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/Makefile | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..e1f095884386 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,12 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o	:= n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o		:= n
+KCOV_INSTRUMENT_mem_encrypt.o	:= n
+
+KASAN_SANITIZE_mem_encrypt.o	:= n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o	= -pg
+endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
-- 
cgit 


From 8eb3f87d903168bdbd1222776a6b1e281f50513e Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Tue, 10 Oct 2017 15:01:22 +0800
Subject: KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit

When KVM emulates an exit from L2 to L1, it loads L1 CR4 into the
guest CR4. Before this CR4 loading, the guest CR4 refers to L2
CR4. Because these two CR4's are in different levels of guest, we
should vmx_set_cr4() rather than kvm_set_cr4() here. The latter, which
is used to handle guest writes to its CR4, checks the guest change to
CR4 and may fail if the change is invalid.

The failure may cause trouble. Consider we start
  a L1 guest with non-zero L1 PCID in use,
     (i.e. L1 CR4.PCIDE == 1 && L1 CR3.PCID != 0)
and
  a L2 guest with L2 PCID disabled,
     (i.e. L2 CR4.PCIDE == 0)
and following events may happen:

1. If kvm_set_cr4() is used in load_vmcs12_host_state() to load L1 CR4
   into guest CR4 (in VMCS01) for L2 to L1 exit, it will fail because
   of PCID check. As a result, the guest CR4 recorded in L0 KVM (i.e.
   vcpu->arch.cr4) is left to the value of L2 CR4.

2. Later, if L1 attempts to change its CR4, e.g., clearing VMXE bit,
   kvm_set_cr4() in L0 KVM will think L1 also wants to enable PCID,
   because the wrong L2 CR4 is used by L0 KVM as L1 CR4. As L1
   CR3.PCID != 0, L0 KVM will inject GP to L1 guest.

Fixes: 4704d0befb072 ("KVM: nVMX: Exiting from L2 to L1")
Cc: qemu-stable@nongnu.org
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a2b804e10c95..95a01609d7ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11297,7 +11297,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
 	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
-	kvm_set_cr4(vcpu, vmcs12->host_cr4);
+	vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
 	nested_ept_uninit_mmu_context(vcpu);
 
-- 
cgit 


From cc6afe2240298049585e86b1ade85efc8a7f225d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Oct 2017 12:12:57 +0200
Subject: x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on
 hypervisors

Commit 594a30fb1242 ("x86/apic: Silence "FW_BUG TSC_DEADLINE disabled
due to Errata" on CPUs without the feature", 2017-08-30) was also about
silencing the warning on VirtualBox; however, KVM does expose the TSC
deadline timer, and it's virtualized so that it is immune from CPU errata.

Therefore, booting 4.13 with "-cpu Haswell" shows this in the logs:

     [    0.000000] [Firmware Bug]: TSC_DEADLINE disabled due to Errata;
                    please update microcode to version: 0xb2 (or later)

Even if you had a hypervisor that does _not_ virtualize the TSC deadline
and rather exposes the hardware one, it should be the hypervisors task
to update microcode and possibly hide the flag from CPUID.  So just
hide the message when running on _any_ hypervisor, not just those that
do not support the TSC deadline timer.

The older check still makes sense, so keep it.

Fixes: bd9240a18e ("x86/apic: Add TSC_DEADLINE quirk due to errata")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: kvm@vger.kernel.org
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1507630377-54471-1-git-send-email-pbonzini@redhat.com
---
 arch/x86/kernel/apic/apic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d705c769f77d..50109eae8cd7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -600,7 +600,8 @@ static void apic_check_deadline_errata(void)
 	const struct x86_cpu_id *m;
 	u32 rev;
 
-	if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+	if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
+	    boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		return;
 
 	m = x86_match_cpu(deadline_match);
-- 
cgit 


From 616dd5872e52493863b0202632703eebd51243dc Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Wed, 11 Oct 2017 17:16:04 -0400
Subject: x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping

SKX stepping-3 fixed the TSC_DEADLINE issue in a different ucode
version number than stepping-4.  Linux needs to know this stepping-3
specific version number to also enable the TSC_DEADLINE on stepping-3.

The steppings and ucode versions are documented in the SKX BIOS update:
https://downloadmirror.intel.com/26978/eng/ReleaseNotes_R00.01.0004.txt

Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Link: https://lkml.kernel.org/r/60f2bbf7cf617e212b522e663f84225bfebc50e5.1507756305.git.len.brown@intel.com
---
 arch/x86/kernel/apic/apic.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 50109eae8cd7..ff891772c9f8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
 	return ~0U;
 }
 
+static u32 skx_deadline_rev(void)
+{
+	switch (boot_cpu_data.x86_mask) {
+	case 0x03: return 0x01000136;
+	case 0x04: return 0x02000014;
+	}
+
+	return ~0U;
+}
+
 static const struct x86_cpu_id deadline_match[] = {
 	DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X,	hsx_deadline_rev),
 	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X,	0x0b000020),
 	DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D,	bdx_deadline_rev),
-	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X,	0x02000014),
+	DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X,	skx_deadline_rev),
 
 	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE,	0x22),
 	DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT,	0x20),
-- 
cgit 


From b956575bed91ecfb136a8300742ecbbf451471ab Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 9 Oct 2017 09:50:49 -0700
Subject: x86/mm: Flush more aggressively in lazy TLB mode

Since commit:

  94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")

x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.

From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.

Unfortunately, I forgot about a subtlety.  By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent.  This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.

I can imagine this causing two different problems:

 - A speculative page walk starting from a bogus page table could read
   IO addresses.  I haven't seen any reports of this causing problems.

 - A speculative page walk that involves a bogus page table can install
   garbage in the TLB.  Such garbage would always be at a user VA, but
   some AMD CPUs have logic that triggers a machine check when it notices
   these bogus entries.  I've seen a couple reports of this.

Boris further explains the failure mode:

> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.

To fix this, reinstate TLB coherence in lazy mode.  With this patch
applied, we do it in one of two ways:

 - If we have PCID, we simply switch back to init_mm's page tables
   when we enter a kernel thread -- this seems to be quite cheap
   except for the cost of serializing the CPU.

 - If we don't have PCID, then we set a flag and switch to init_mm
   the first time we would otherwise need to flush the TLB.

The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.

In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed.  Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h |   8 +-
 arch/x86/include/asm/tlbflush.h    |  24 ++++++
 arch/x86/mm/tlb.c                  | 153 +++++++++++++++++++++++++++----------
 3 files changed, 136 insertions(+), 49 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c120b5db178a..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-	int cpu = smp_processor_id();
-
-	if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
-		cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4893abf7f74f..d362161d3291 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 /*
  * 6 because 6 should be plenty and struct tlb_state will fit in
  * two cache lines.
@@ -104,6 +111,23 @@ struct tlb_state {
 	u16 loaded_mm_asid;
 	u16 next_asid;
 
+	/*
+	 * We can be in one of several states:
+	 *
+	 *  - Actively using an mm.  Our CPU's bit will be set in
+	 *    mm_cpumask(loaded_mm) and is_lazy == false;
+	 *
+	 *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
+	 *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+	 *
+	 *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
+	 *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
+	 *    We're heuristically guessing that the CR3 load we
+	 *    skipped more than makes up for the overhead added by
+	 *    lazy mode.
+	 */
+	bool is_lazy;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..658bf0090565 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,8 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 			    u16 *new_asid, bool *need_flush)
 {
@@ -80,7 +82,7 @@ void leave_mm(int cpu)
 		return;
 
 	/* Warn if we're not lazy. */
-	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
 
 	switch_mm(NULL, &init_mm, NULL);
 }
@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		__flush_tlb_all();
 	}
 #endif
+	this_cpu_write(cpu_tlbstate.is_lazy, false);
 
 	if (real_prev == next) {
 		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			  next->context.ctx_id);
 
-		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-			/*
-			 * There's nothing to do: we weren't lazy, and we
-			 * aren't changing our mm.  We don't need to flush
-			 * anything, nor do we need to update CR3, CR4, or
-			 * LDTR.
-			 */
-			return;
-		}
-
-		/* Resume remote flushes and then read tlb_gen. */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-		    next_tlb_gen) {
-			/*
-			 * Ideally, we'd have a flush_tlb() variant that
-			 * takes the known CR3 value as input.  This would
-			 * be faster on Xen PV and on hypothetical CPUs
-			 * on which INVPCID is fast.
-			 */
-			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-				       next_tlb_gen);
-			write_cr3(build_cr3(next, prev_asid));
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
-		}
-
 		/*
-		 * We just exited lazy mode, which means that CR4 and/or LDTR
-		 * may be stale.  (Changes to the required CR4 and LDTR states
-		 * are not reflected in tlb_gen.)
+		 * We don't currently support having a real mm loaded without
+		 * our cpu set in mm_cpumask().  We have all the bookkeeping
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
 		 */
+		if (WARN_ON_ONCE(real_prev != &init_mm &&
+				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+			cpumask_set_cpu(cpu, mm_cpumask(next));
+
+		return;
 	} else {
 		u16 new_asid;
 		bool need_flush;
@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		}
 
 		/* Stop remote flushes for the previous mm */
-		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
-			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+				real_prev != &init_mm);
+		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
 		/*
 		 * Start remote flushes and then read tlb_gen.
@@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here.  We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy.  We do, however, need to maintain coherence of
+		 * paging-structure caches.  We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
+		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
+}
+
 /*
  * Call this when reinitializing a CPU.  It fixes the following potential
  * problems:
@@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
+	if (unlikely(loaded_mm == &init_mm))
+		return;
+
 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
-	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
 		/*
-		 * We're in lazy mode -- don't flush.  We can get here on
-		 * remote flushes due to races and on local flushes if a
-		 * kernel thread coincidentally flushes the mm it's lazily
-		 * still using.
+		 * We're in lazy mode.  We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB.  Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
 		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
 	}
 
@@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
 	return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+				 size_t count, loff_t *ppos)
+{
+	char buf[2];
+
+	buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+	buf[1] = '\n';
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+		 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	bool val;
+
+	if (kstrtobool_from_user(user_buf, count, &val))
+		return -EINVAL;
+
+	if (val)
+		static_branch_enable(&tlb_use_lazy_mode);
+	else
+		static_branch_disable(&tlb_use_lazy_mode);
+
+	return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+	.read = tlblazy_read_file,
+	.write = tlblazy_write_file,
+	.llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCID)) {
+		/*
+		 * Heuristic: with PCID on, switching to and from
+		 * init_mm is reasonably fast, but remote flush IPIs
+		 * as expensive as ever, so turn off lazy TLB mode.
+		 *
+		 * We can't do this in setup_pcid() because static keys
+		 * haven't been initialized yet, and it would blow up
+		 * badly.
+		 */
+		static_branch_disable(&tlb_use_lazy_mode);
+	}
+
+	debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+			    arch_debugfs_dir, NULL, &fops_tlblazy);
+	return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);
-- 
cgit 


From 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 12 Oct 2017 13:23:16 +0200
Subject: x86/microcode: Do the family check first

On CPUs like AMD's Geode, for example, we shouldn't even try to load
microcode because they do not support the modern microcode loading
interface.

However, we do the family check *after* the other checks whether the
loader has been disabled on the command line or whether we're running in
a guest.

So move the family checks first in order to exit early if we're being
loaded on an unsupported family.

Reported-and-tested-by: Sven Glodowski <glodi1@arcor.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org> # 4.11..
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://bugzilla.suse.com/show_bug.cgi?id=1061396
Link: http://lkml.kernel.org/r/20171012112316.977-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/microcode/core.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 86e8f0b2537b..c4fa4a85d4cb 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
 	bool *res = &dis_ucode_ldr;
 #endif
 
-	if (!have_cpuid_p())
-		return *res;
-
 	/*
 	 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
 	 * completely accurate as xen pv guests don't see that CPUID bit set but
@@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
 void __init load_ucode_bsp(void)
 {
 	unsigned int cpuid_1_eax;
+	bool intel = true;
 
-	if (check_loader_disabled_bsp())
+	if (!have_cpuid_p())
 		return;
 
 	cpuid_1_eax = native_cpuid_eax(1);
 
 	switch (x86_cpuid_vendor()) {
 	case X86_VENDOR_INTEL:
-		if (x86_family(cpuid_1_eax) >= 6)
-			load_ucode_intel_bsp();
+		if (x86_family(cpuid_1_eax) < 6)
+			return;
 		break;
+
 	case X86_VENDOR_AMD:
-		if (x86_family(cpuid_1_eax) >= 0x10)
-			load_ucode_amd_bsp(cpuid_1_eax);
+		if (x86_family(cpuid_1_eax) < 0x10)
+			return;
+		intel = false;
 		break;
+
 	default:
-		break;
+		return;
 	}
+
+	if (check_loader_disabled_bsp())
+		return;
+
+	if (intel)
+		load_ucode_intel_bsp();
+	else
+		load_ucode_amd_bsp(cpuid_1_eax);
 }
 
 static bool check_loader_disabled_ap(void)
-- 
cgit