From d6d55f0b9d900673548515614b56ab55aa2c51f8 Mon Sep 17 00:00:00 2001
From: Jacob Shin <jacob.w.shin@gmail.com>
Date: Thu, 29 May 2014 17:26:50 +0200
Subject: perf/x86/amd: AMD support for bp_len > HW_BREAKPOINT_LEN_8

Implement hardware breakpoint address mask for AMD Family 16h and
above processors. CPUID feature bit indicates hardware support for
DRn_ADDR_MASK MSRs. These masks further qualify DRn/DR7 hardware
breakpoint addresses to allow matching of larger addresses ranges.

Valuable advice and pseudo code from Oleg Nesterov <oleg@redhat.com>

Signed-off-by: Jacob Shin <jacob.w.shin@gmail.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: xiakaixu <xiakaixu@huawei.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/include/asm/cpufeature.h    | 2 ++
 arch/x86/include/asm/debugreg.h      | 5 +++++
 arch/x86/include/asm/hw_breakpoint.h | 1 +
 3 files changed, 8 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0bb1335313b2..53966d65591e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -174,6 +174,7 @@
 #define X86_FEATURE_TOPOEXT	( 6*32+22) /* topology extensions CPUID leafs */
 #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
 
 /*
@@ -383,6 +384,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define cpu_has_cx16		boot_cpu_has(X86_FEATURE_CX16)
 #define cpu_has_eager_fpu	boot_cpu_has(X86_FEATURE_EAGER_FPU)
 #define cpu_has_topoext		boot_cpu_has(X86_FEATURE_TOPOEXT)
+#define cpu_has_bpext		boot_cpu_has(X86_FEATURE_BPEXT)
 
 #if __GNUC__ >= 4
 extern void warn_pre_alternatives(void);
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 61fd18b83b6c..12cb66f6d3a5 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -114,5 +114,10 @@ static inline void debug_stack_usage_inc(void) { }
 static inline void debug_stack_usage_dec(void) { }
 #endif /* X86_64 */
 
+#ifdef CONFIG_CPU_SUP_AMD
+extern void set_dr_addr_mask(unsigned long mask, int dr);
+#else
+static inline void set_dr_addr_mask(unsigned long mask, int dr) { }
+#endif
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index ef1c4d2d41ec..6c98be864a75 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -12,6 +12,7 @@
  */
 struct arch_hw_breakpoint {
 	unsigned long	address;
+	unsigned long	mask;
 	u8		len;
 	u8		type;
 };
-- 
cgit 


From b700e7f03df5d92f85fa5247fe1f557528d3363d Mon Sep 17 00:00:00 2001
From: Seth Jennings <sjenning@redhat.com>
Date: Tue, 16 Dec 2014 11:58:19 -0600
Subject: livepatch: kernel: add support for live patching

This commit introduces code for the live patching core.  It implements
an ftrace-based mechanism and kernel interface for doing live patching
of kernel and kernel module functions.

It represents the greatest common functionality set between kpatch and
kgraft and can accept patches built using either method.

This first version does not implement any consistency mechanism that
ensures that old and new code do not run together.  In practice, ~90% of
CVEs are safe to apply in this way, since they simply add a conditional
check.  However, any function change that can not execute safely with
the old version of the function can _not_ be safely applied in this
version.

[ jkosina@suse.cz: due to the number of contributions that got folded into
  this original patch from Seth Jennings, add SUSE's copyright as well, as
  discussed via e-mail ]

Signed-off-by: Seth Jennings <sjenning@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Petr Mladek <pmladek@suse.cz>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Petr Mladek <pmladek@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/livepatch.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 arch/x86/include/asm/livepatch.h

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
new file mode 100644
index 000000000000..d529db1b1edf
--- /dev/null
+++ b/arch/x86/include/asm/livepatch.h
@@ -0,0 +1,37 @@
+/*
+ * livepatch.h - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ASM_X86_LIVEPATCH_H
+#define _ASM_X86_LIVEPATCH_H
+
+#include <linux/module.h>
+
+#ifdef CONFIG_LIVE_PATCHING
+#ifndef CC_USING_FENTRY
+#error Your compiler must support -mfentry for live patching to work
+#endif
+extern int klp_write_module_reloc(struct module *mod, unsigned long type,
+				  unsigned long loc, unsigned long value);
+
+#else
+#error Live patching support is disabled; check CONFIG_LIVE_PATCHING
+#endif
+
+#endif /* _ASM_X86_LIVEPATCH_H */
-- 
cgit 


From b5bfc51707f1b56b0b733980bb4fcc0562bf02d8 Mon Sep 17 00:00:00 2001
From: Li Bin <huawei.libin@huawei.com>
Date: Fri, 19 Dec 2014 14:11:17 +0800
Subject: livepatch: move x86 specific ftrace handler code to arch/x86

The execution flow redirection related implemention in the livepatch
ftrace handler is depended on the specific architecture. This patch
introduces klp_arch_set_pc(like kgdb_arch_set_pc) interface to change
the pt_regs.

Signed-off-by: Li Bin <huawei.libin@huawei.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/livepatch.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index d529db1b1edf..b5608d7757fd 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -22,6 +22,7 @@
 #define _ASM_X86_LIVEPATCH_H
 
 #include <linux/module.h>
+#include <linux/ftrace.h>
 
 #ifdef CONFIG_LIVE_PATCHING
 #ifndef CC_USING_FENTRY
@@ -30,6 +31,10 @@
 extern int klp_write_module_reloc(struct module *mod, unsigned long type,
 				  unsigned long loc, unsigned long value);
 
+static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+	regs->ip = ip;
+}
 #else
 #error Live patching support is disabled; check CONFIG_LIVE_PATCHING
 #endif
-- 
cgit 


From 6ca7a8a15035add0a4f9b2fd658118d41dbeb20c Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 21 Dec 2014 15:02:23 +0100
Subject: x86/fpu: Use a symbolic name for asm operand

Fix up the else-case in fpu_fxsave() which seems like it has
been overlooked. Correct comment style in restore_fpu_checking()
while at it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1419170543-11393-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e97622f57722..0dbc08282291 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -207,7 +207,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
 	if (config_enabled(CONFIG_X86_32))
 		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
 	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave));
+		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
 	else {
 		/* Using "rex64; fxsave %0" is broken because, if the memory
 		 * operand uses any extended registers for addressing, a second
@@ -290,9 +290,11 @@ static inline int fpu_restore_checking(struct fpu *fpu)
 
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
-	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-	   is pending.  Clear the x87 state here by setting it to fixed
-	   values. "m" is a random variable that should be in L1 */
+	/*
+	 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
+	 * pending. Clear the x87 state here by setting it to fixed values.
+	 * "m" is a random variable that should be in L1.
+	 */
 	if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
 		asm volatile(
 			"fnclex\n\t"
-- 
cgit 


From 959274753857efe9c5f1ba35fe727f51e9aa128d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Wed, 19 Nov 2014 17:41:09 -0800
Subject: x86, traps: Track entry into and exit from IST context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We currently pretend that IST context is like standard exception
context, but this is incorrect.  IST entries from userspace are like
standard exceptions except that they use per-cpu stacks, so they are
atomic.  IST entries from kernel space are like NMIs from RCU's
perspective -- they are not quiescent states even if they
interrupted the kernel during a quiescent state.

Add and use ist_enter and ist_exit to track IST context.  Even
though x86_32 has no IST stacks, we track these interrupts the same
way.

This fixes two issues:

 - Scheduling from an IST interrupt handler will now warn.  It would
   previously appear to work as long as we got lucky and nothing
   overwrote the stack frame.  (I don't know of any bugs in this
   that would trigger the warning, but it's good to be on the safe
   side.)

 - RCU handling in IST context was dangerous.  As far as I know,
   only machine checks were likely to trigger this, but it's good to
   be on the safe side.

Note that the machine check handlers appears to have been missing
any context tracking at all before this patch.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/traps.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 707adc6549d8..3cf525ec762d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_TRAPS_H
 #define _ASM_X86_TRAPS_H
 
+#include <linux/context_tracking_state.h>
 #include <linux/kprobes.h>
 
 #include <asm/debugreg.h>
@@ -110,6 +111,9 @@ asmlinkage void smp_thermal_interrupt(void);
 asmlinkage void mce_threshold_interrupt(void);
 #endif
 
+extern enum ctx_state ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
-- 
cgit 


From 83653c16da91112236292871b820cb8b367220e3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 13 Nov 2014 15:57:07 -0800
Subject: x86: Clean up current_stack_pointer

There's no good reason for it to be a macro, and x86_64 will want to
use it, so it should be in a header.

Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/thread_info.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 547e344a6dc6..8b13b0fbda8e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -170,6 +170,17 @@ static inline struct thread_info *current_thread_info(void)
 	return ti;
 }
 
+static inline unsigned long current_stack_pointer(void)
+{
+	unsigned long sp;
+#ifdef CONFIG_X86_64
+	asm("mov %%rsp,%0" : "=g" (sp));
+#else
+	asm("mov %%esp,%0" : "=g" (sp));
+#endif
+	return sp;
+}
+
 #else /* !__ASSEMBLY__ */
 
 /* how to get the thread information struct from ASM */
-- 
cgit 


From bced35b65aefe53a6f77a9ed0ce1aea86e9d65a2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Wed, 19 Nov 2014 17:59:41 -0800
Subject: x86, traps: Add ist_begin_non_atomic and ist_end_non_atomic

In some IST handlers, if the interrupt came from user mode,
we can safely enable preemption.  Add helpers to do it safely.

This is intended to be used my the memory failure code in
do_machine_check.

Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/traps.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 3cf525ec762d..4e49d7dff78e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -113,6 +113,8 @@ asmlinkage void mce_threshold_interrupt(void);
 
 extern enum ctx_state ist_enter(struct pt_regs *regs);
 extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_begin_non_atomic(struct pt_regs *regs);
+extern void ist_end_non_atomic(void);
 
 /* Interrupts/Exceptions */
 enum {
-- 
cgit 


From d4812e169de44f4ab53ff671c6193c67de24da62 Mon Sep 17 00:00:00 2001
From: "Luck, Tony" <tony.luck@intel.com>
Date: Mon, 5 Jan 2015 16:44:42 -0800
Subject: x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks

We now switch to the kernel stack when a machine check interrupts
during user mode.  This means that we can perform recovery actions
in the tail of do_machine_check()

Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/mce.h         | 1 -
 arch/x86/include/asm/thread_info.h | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..9b3de99dc004 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -190,7 +190,6 @@ enum mcp_flags {
 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
-void mce_notify_process(void);
 
 DECLARE_PER_CPU(struct mce, injectm);
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8b13b0fbda8e..e82e95abc92b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -75,7 +75,6 @@ struct thread_info {
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
-#define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -100,7 +99,6 @@ struct thread_info {
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
-#define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
@@ -140,7 +138,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |				\
 	 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
 
 /* flags to check in __switch_to() */
-- 
cgit 


From 7c6a98dfa1ba9dc64a62e73624ecea9995736bbd Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 16 Dec 2014 09:08:14 -0500
Subject: KVM: x86: add method to test PIR bitmap vector

kvm_x86_ops->test_posted_interrupt() returns true/false depending
whether 'vector' is set.

Next patch makes use of this interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d89c6b828c96..cb19d05af3cd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -753,6 +753,7 @@ struct kvm_x86_ops {
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+	bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
-- 
cgit 


From c205fb7d7d4f81e46fc577b707ceb9e356af1456 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@cs.technion.ac.il>
Date: Thu, 25 Dec 2014 02:52:16 +0200
Subject: KVM: x86: #PF error-code on R/W operations is wrong

When emulating an instruction that reads the destination memory operand (i.e.,
instructions without the Mov flag in the emulator), the operand is first read.
If a page-fault is detected in this phase, the error-code which would be
delivered to the VM does not indicate that the access that caused the exception
is a write one. This does not conform with real hardware, and may cause the VM
to enter the page-fault handler twice for no reason (once for read, once for
write).

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cb19d05af3cd..97a5dd0222c8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -160,6 +160,18 @@ enum {
 #define DR7_FIXED_1	0x00000400
 #define DR7_VOLATILE	0xffff2bff
 
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+
+#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
+#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
 /*
-- 
cgit 


From b9dfe0bed999d23ee8838d389637dd8aef83fafa Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 9 Jan 2015 10:53:21 +0100
Subject: livepatch: handle ancient compilers with more grace

We are aborting a build in case when gcc doesn't support fentry on x86_64
(regs->ip modification can't really reliably work with mcount).

This however breaks allmodconfig for people with older gccs that don't
support -mfentry.

Turn the build-time failure into runtime failure, resulting in the whole
infrastructure not being initialized if CC_USING_FENTRY is unset.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/include/asm/livepatch.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index b5608d7757fd..26e58134c8cb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -25,9 +25,13 @@
 #include <linux/ftrace.h>
 
 #ifdef CONFIG_LIVE_PATCHING
+static inline int klp_check_compiler_support(void)
+{
 #ifndef CC_USING_FENTRY
-#error Your compiler must support -mfentry for live patching to work
+	return 1;
 #endif
+	return 0;
+}
 extern int klp_write_module_reloc(struct module *mod, unsigned long type,
 				  unsigned long loc, unsigned long value);
 
-- 
cgit 


From e182c570e9953859aee5cb016583217d9e68ea18 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 12 Dec 2014 01:56:04 +0200
Subject: x86/uaccess: fix sparse errors

virtio wants to read bitwise types from userspace using get_user.  At the
moment this triggers sparse errors, since the value is passed through an
integer.

Fix that up using __force.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 0d592e0a5b84..ace9dec050b1 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 	asm volatile("call __get_user_%P3"				\
 		     : "=a" (__ret_gu), "=r" (__val_gu)			\
 		     : "0" (ptr), "i" (sizeof(*(ptr))));		\
-	(x) = (__typeof__(*(ptr))) __val_gu;				\
+	(x) = (__force __typeof__(*(ptr))) __val_gu;			\
 	__ret_gu;							\
 })
 
-- 
cgit 


From af9cfe270dd133f2f40c287e8d41919bb8d063e3 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Thu, 8 Jan 2015 17:25:12 +0100
Subject: x86: entry_64.S: delete unused code

A define, two macros and an unreferenced bit of assembly are gone.

Acked-by: Borislav Petkov <bp@suse.de>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Oleg Nesterov <oleg@redhat.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: X86 ML <x86@kernel.org>
CC: Alexei Starovoitov <ast@plumgrid.com>
CC: Will Drewry <wad@chromium.org>
CC: Kees Cook <keescook@chromium.org>
CC: linux-kernel@vger.kernel.org
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/calling.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 76659b67fd11..1f1297b46f83 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
 #define SS		160
 
 #define ARGOFFSET	R11
-#define SWFRAME		ORIG_RAX
 
 	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
 	subq  $9*8+\addskip, %rsp
-- 
cgit 


From a1dafe857db56c35878c71560089a4694ac841fd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 7 Jan 2015 15:31:28 +0800
Subject: iommu, x86: Restructure setup of the irq remapping feature

enable_IR_x2apic() calls setup_irq_remapping_ops() which by default
installs the intel dmar remapping ops and then calls the amd iommu irq
remapping prepare callback to figure out whether we are running on an
AMD machine with irq remapping hardware.

Right after that it calls irq_remapping_prepare() which pointlessly
checks:
	if (!remap_ops || !remap_ops->prepare)
               return -ENODEV;
and then calls

    remap_ops->prepare()

which is silly in the AMD case as it got called from
setup_irq_remapping_ops() already a few microseconds ago.

Simplify this and just collapse everything into
irq_remapping_prepare().

The irq_remapping_prepare() remains still silly as it assigns blindly
the intel ops, but that's not scope of this patch.

The scope here is to move the preperatory work, i.e. memory
allocations out of the atomic section which is required to enable irq
remapping.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Borislav Petkov <bp@alien8.de>
Acked-and-tested-by: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: Joerg Roedel <jroedel@suse.de>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Oren Twaig <oren@scalemp.com>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20141205084147.232633738@linutronix.de
Link: http://lkml.kernel.org/r/1420615903-28253-2-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/irq_remapping.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index b7747c4c2cf2..f1b619e5a50d 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,7 +33,6 @@ struct irq_cfg;
 
 #ifdef CONFIG_IRQ_REMAP
 
-extern void setup_irq_remapping_ops(void);
 extern int irq_remapping_supported(void);
 extern void set_irq_remapping_broken(void);
 extern int irq_remapping_prepare(void);
@@ -60,7 +59,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip);
 
 #else  /* CONFIG_IRQ_REMAP */
 
-static inline void setup_irq_remapping_ops(void) { }
 static inline int irq_remapping_supported(void) { return 0; }
 static inline void set_irq_remapping_broken(void) { }
 static inline int irq_remapping_prepare(void) { return -ENODEV; }
-- 
cgit 


From c392f56c946033bd136043079a62b9188888828d Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Wed, 7 Jan 2015 15:31:40 +0800
Subject: iommu/irq_remapping: Kill function irq_remapping_supported() and
 related code

Simplify irq_remapping code by killing irq_remapping_supported() and
related interfaces.

Joerg posted a similar patch at https://lkml.org/lkml/2014/12/15/490,
so assume an signed-off from Joerg.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Tested-by: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Rientjes <rientjes@google.com>
Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Oren Twaig <oren@scalemp.com>
Link: http://lkml.kernel.org/r/1420615903-28253-14-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/irq_remapping.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f1b619e5a50d..6224d316c405 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,7 +33,6 @@ struct irq_cfg;
 
 #ifdef CONFIG_IRQ_REMAP
 
-extern int irq_remapping_supported(void);
 extern void set_irq_remapping_broken(void);
 extern int irq_remapping_prepare(void);
 extern int irq_remapping_enable(void);
@@ -59,7 +58,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip);
 
 #else  /* CONFIG_IRQ_REMAP */
 
-static inline int irq_remapping_supported(void) { return 0; }
 static inline void set_irq_remapping_broken(void) { }
 static inline int irq_remapping_prepare(void) { return -ENODEV; }
 static inline int irq_remapping_enable(void) { return -ENODEV; }
-- 
cgit 


From e108ff2f8033a417ee3e517d9f8730f665646076 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 15 Jan 2015 15:58:54 -0800
Subject: KVM: x86: switch to kvm_get_dirty_log_protect

We now have a generic function that does most of the work of
kvm_vm_ioctl_get_dirty_log, now use it.

Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/x86/include/asm/kvm_host.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cb19d05af3cd..3ceddf41ca74 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -821,9 +821,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-				     struct kvm_memory_slot *slot,
-				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
-- 
cgit 


From bccec2a0a25206cb837e939adab94768a990ffa9 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 6 Jan 2015 22:49:54 +0100
Subject: x86/spinlock: Leftover conversion ACCESS_ONCE->READ_ONCE

commit 78bff1c8684f ("x86/ticketlock: Fix spin_unlock_wait() livelock")
introduced two additional ACCESS_ONCE cases in x86 spinlock.h.
Lets change those as well.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 arch/x86/include/asm/spinlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 625660f8a2fc..7050d864f520 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -183,10 +183,10 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	__ticket_t head = ACCESS_ONCE(lock->tickets.head);
+	__ticket_t head = READ_ONCE(lock->tickets.head);
 
 	for (;;) {
-		struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+		struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 		/*
 		 * We need to check "unlocked" in a loop, tmp.head == head
 		 * can be false positive because of overflow.
-- 
cgit 


From 0e1540208ef34a2246822fa56f751efe23748e7a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 14 Jan 2015 18:39:35 +0200
Subject: x86: pmc_atom: Expose contents of PSS

The PSS register reflects the power state of each island on SoC. It would be
useful to know which of the islands is on or off at the momemnt.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Kumar P. Mahesh <mahesh.kumar.p@intel.com>
Link: http://lkml.kernel.org/r/1421253575-22509-6-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/pmc_atom.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
index fc7a17c05d35..bc0fc0866553 100644
--- a/arch/x86/include/asm/pmc_atom.h
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -53,6 +53,28 @@
 /* Sleep state counter is in units of of 32us */
 #define	PMC_TMR_SHIFT		5
 
+/* Power status of power islands */
+#define	PMC_PSS			0x98
+
+#define PMC_PSS_BIT_GBE			BIT(0)
+#define PMC_PSS_BIT_SATA		BIT(1)
+#define PMC_PSS_BIT_HDA			BIT(2)
+#define PMC_PSS_BIT_SEC			BIT(3)
+#define PMC_PSS_BIT_PCIE		BIT(4)
+#define PMC_PSS_BIT_LPSS		BIT(5)
+#define PMC_PSS_BIT_LPE			BIT(6)
+#define PMC_PSS_BIT_DFX			BIT(7)
+#define PMC_PSS_BIT_USH_CTRL		BIT(8)
+#define PMC_PSS_BIT_USH_SUS		BIT(9)
+#define PMC_PSS_BIT_USH_VCCS		BIT(10)
+#define PMC_PSS_BIT_USH_VCCA		BIT(11)
+#define PMC_PSS_BIT_OTG_CTRL		BIT(12)
+#define PMC_PSS_BIT_OTG_VCCS		BIT(13)
+#define PMC_PSS_BIT_OTG_VCCA_CLK	BIT(14)
+#define PMC_PSS_BIT_OTG_VCCA		BIT(15)
+#define PMC_PSS_BIT_USB			BIT(16)
+#define PMC_PSS_BIT_USB_SUS		BIT(17)
+
 /* These registers reflect D3 status of functions */
 #define	PMC_D3_STS_0		0xA0
 
-- 
cgit 


From 14e153ef75eecae8fd0738ffb42120f4962a00cd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 15 Jan 2015 20:19:43 +0100
Subject: x86, fpu: Introduce per-cpu in_kernel_fpu state

interrupted_kernel_fpu_idle() tries to detect if kernel_fpu_begin()
is safe or not. In particular it should obviously deny the nested
kernel_fpu_begin() and this logic looks very confusing.

If use_eager_fpu() == T we rely on a) __thread_has_fpu() check in
interrupted_kernel_fpu_idle(), and b) on the fact that _begin() does
__thread_clear_has_fpu().

Otherwise we demand that the interrupted task has no FPU if it is in
kernel mode, this works because __kernel_fpu_begin() does clts() and
interrupted_kernel_fpu_idle() checks X86_CR0_TS.

Add the per-cpu "bool in_kernel_fpu" variable, and change this code
to check/set/clear it. This allows to do more cleanups and fixes, see
the next changes.

The patch also moves WARN_ON_ONCE() under preempt_disable() just to
make this_cpu_read() look better, this is not really needed. And in
fact I think we should move it into __kernel_fpu_begin().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: matt.fleming@intel.com
Cc: bp@suse.de
Cc: pbonzini@redhat.com
Cc: luto@amacapital.net
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150115191943.GB27332@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/i387.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ed8089d69094..5e275d31802e 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -40,8 +40,8 @@ extern void __kernel_fpu_end(void);
 
 static inline void kernel_fpu_begin(void)
 {
-	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
+	WARN_ON_ONCE(!irq_fpu_usable());
 	__kernel_fpu_begin();
 }
 
-- 
cgit 


From 7575637ab293861a799f3bbafe0d8c597389f4e9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 15 Jan 2015 20:20:28 +0100
Subject: x86, fpu: Fix math_state_restore() race with kernel_fpu_begin()

math_state_restore() can race with kernel_fpu_begin() if irq comes
right after __thread_fpu_begin(), __save_init_fpu() will overwrite
fpu->state we are going to restore.

Add 2 simple helpers, kernel_fpu_disable() and kernel_fpu_enable()
which simply set/clear in_kernel_fpu, and change math_state_restore()
to exclude kernel_fpu_begin() in between.

Alternatively we could use local_irq_save/restore, but probably these
new helpers can have more users.

Perhaps they should disable/enable preemption themselves, in this case
we can remove preempt_disable() in __restore_xstate_sig().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: matt.fleming@intel.com
Cc: bp@suse.de
Cc: pbonzini@redhat.com
Cc: luto@amacapital.net
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150115192028.GD27332@redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/i387.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 5e275d31802e..6eb6fcb83f63 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -51,6 +51,10 @@ static inline void kernel_fpu_end(void)
 	preempt_enable();
 }
 
+/* Must be called with preempt disabled */
+extern void kernel_fpu_disable(void);
+extern void kernel_fpu_enable(void);
+
 /*
  * Some instructions like VIA's padlock instructions generate a spurious
  * DNA fault but don't modify SSE registers. And these instructions
-- 
cgit 


From 54750f2cf042c42b4223d67b1bd20138464bde0e Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 20 Jan 2015 15:54:52 -0200
Subject: KVM: x86: workaround SuSE's 2.6.16 pvclock vs masterclock issue

SuSE's 2.6.16 kernel fails to boot if the delta between tsc_timestamp
and rdtsc is larger than a given threshold:

 * If we get more than the below threshold into the future, we rerequest
 * the real time from the host again which has only little offset then
 * that we need to adjust using the TSC.
 *
 * For now that threshold is 1/5th of a jiffie. That should be good
 * enough accuracy for completely broken systems, but also give us swing
 * to not call out to the host all the time.
 */
#define PVCLOCK_DELTA_MAX ((1000000000ULL / HZ) / 5)

Disable masterclock support (which increases said delta) in case the
boot vcpu does not use MSR_KVM_SYSTEM_TIME_NEW.

Upstreams kernels which support pvclock vsyscalls (and therefore make
use of PVCLOCK_STABLE_BIT) use MSR_KVM_SYSTEM_TIME_NEW.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 97a5dd0222c8..177b2f2ff9fb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -627,6 +627,8 @@ struct kvm_arch {
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
+
+	bool boot_vcpu_runs_old_kvmclock;
 };
 
 struct kvm_vm_stat {
-- 
cgit 


From cfaa790a3fb8a7efa98f4a6457e19dc3a0db35d3 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 15 Jan 2015 09:44:56 +0100
Subject: kvm: Fix CR3_PCID_INVD type on 32-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch/x86/kvm/emulate.c: In function ‘check_cr_write’:
arch/x86/kvm/emulate.c:3552:4: warning: left shift count >= width of type
    rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;

happens because sizeof(UL) on 32-bit is 4 bytes but we shift it 63 bits
to the left.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 177b2f2ff9fb..4327af53e544 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -51,7 +51,7 @@
 			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
 #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_PCID_INVD		 (1UL << 63)
+#define CR3_PCID_INVD		 BIT_64(63)
 #define CR4_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
-- 
cgit 


From 8d80696060eedf49c080c0f2cf39a20ae7e787f9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:09 +0000
Subject: x86/apic: Avoid open coded x2apic detection

enable_IR_x2apic() grew a open coded x2apic detection. Implement a
proper helper function which shares the code with the already existing
x2apic_enabled().

Made it use rdmsrl_safe as suggested by Boris.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20150115211702.285038186@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 465b309af254..1a8ba26c2fbd 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -108,6 +108,15 @@ extern u64 native_apic_icr_read(void);
 
 extern int x2apic_mode;
 
+static inline bool apic_is_x2apic_enabled(void)
+{
+	u64 msr;
+
+	if (rdmsrl_safe(MSR_IA32_APICBASE, &msr))
+		return false;
+	return msr & X2APIC_ENABLE;
+}
+
 #ifdef CONFIG_X86_X2APIC
 /*
  * Make previous memory operations globally visible before
@@ -175,15 +184,7 @@ extern void check_x2apic(void);
 extern void enable_x2apic(void);
 static inline int x2apic_enabled(void)
 {
-	u64 msr;
-
-	if (!cpu_has_x2apic)
-		return 0;
-
-	rdmsrl(MSR_IA32_APICBASE, msr);
-	if (msr & X2APIC_ENABLE)
-		return 1;
-	return 0;
+	return cpu_has_x2apic && apic_is_x2apic_enabled();
 }
 
 #define x2apic_supported()	(cpu_has_x2apic)
-- 
cgit 


From 81a46dd8249d7fa72a8557e58a38aa984e6b5e16 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:11 +0000
Subject: x86/apic: Make x2apic_mode depend on CONFIG_X86_X2APIC

No point in having a static variable around which is always 0. Let the
compiler optimize code out if disabled.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20150115211702.363274310@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1a8ba26c2fbd..d2225fdc953e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -106,8 +106,6 @@ extern u32 native_safe_apic_wait_icr_idle(void);
 extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
 
-extern int x2apic_mode;
-
 static inline bool apic_is_x2apic_enabled(void)
 {
 	u64 msr;
@@ -178,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void)
 	return val;
 }
 
+extern int x2apic_mode;
 extern int x2apic_phys;
 extern int x2apic_preenabled;
 extern void check_x2apic(void);
@@ -210,8 +209,9 @@ static inline void x2apic_force_phys(void)
 {
 }
 
-#define	x2apic_preenabled 0
-#define	x2apic_supported()	0
+#define x2apic_mode		(0)
+#define	x2apic_preenabled	(0)
+#define	x2apic_supported()	(0)
 #endif
 
 extern void enable_IR_x2apic(void);
-- 
cgit 


From 2ca5b40479246087695d9e6343075b47ee6887ea Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:14 +0000
Subject: x86/ioapic: Check x2apic really

The x2apic_preenabled flag is just a horrible hack and if X2APIC
support is disabled it does not reflect the actual hardware
state. Check the hardware instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211702.541280622@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d2225fdc953e..392bbcf35471 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -178,7 +178,6 @@ static inline u64 native_x2apic_icr_read(void)
 
 extern int x2apic_mode;
 extern int x2apic_phys;
-extern int x2apic_preenabled;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 static inline int x2apic_enabled(void)
@@ -210,7 +209,6 @@ static inline void x2apic_force_phys(void)
 }
 
 #define x2apic_mode		(0)
-#define	x2apic_preenabled	(0)
 #define	x2apic_supported()	(0)
 #endif
 
-- 
cgit 


From d524165cb8dbb2ce5916cd7682236b9324ae2644 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:17 +0000
Subject: x86/apic: Check x2apic early

No point in delaying the x2apic detection for the CONFIG_X86_X2APIC=n
case to enable_IR_x2apic(). We rather detect that before we try to
setup anything there.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211702.702479404@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 392bbcf35471..ca8deb484d03 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -178,7 +178,7 @@ static inline u64 native_x2apic_icr_read(void)
 
 extern int x2apic_mode;
 extern int x2apic_phys;
-extern void check_x2apic(void);
+extern void __init check_x2apic(void);
 extern void enable_x2apic(void);
 static inline int x2apic_enabled(void)
 {
-- 
cgit 


From 55eae7de727e9ecc814853ec364fbbb352c337df Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:19 +0000
Subject: x86/x2apic: Move code in conditional region

No point in having try_to_enable_x2apic() outside of the
CONFIG_X86_X2APIC section and having inline functions and more ifdefs
to deal with it. Move the code into the existing ifdef section and
remove the inline cruft.

Fixup the printk about not enabling interrupt remapping as suggested
by Boris.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211702.795388613@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index ca8deb484d03..951caa17d8ba 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -186,27 +186,11 @@ static inline int x2apic_enabled(void)
 }
 
 #define x2apic_supported()	(cpu_has_x2apic)
-static inline void x2apic_force_phys(void)
-{
-	x2apic_phys = 1;
-}
 #else
-static inline void disable_x2apic(void)
-{
-}
-static inline void check_x2apic(void)
-{
-}
-static inline void enable_x2apic(void)
-{
-}
-static inline int x2apic_enabled(void)
-{
-	return 0;
-}
-static inline void x2apic_force_phys(void)
-{
-}
+static inline void disable_x2apic(void) { }
+static inline void check_x2apic(void) { }
+static inline void enable_x2apic(void) { }
+static inline int x2apic_enabled(void) { return 0; }
 
 #define x2apic_mode		(0)
 #define	x2apic_supported()	(0)
-- 
cgit 


From 44e25ff9e6912347a1a54c757fc75681d0dc42d0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:24 +0000
Subject: x86/x2apic: Disable x2apic from nox2apic setup

There is no point in postponing the hardware disablement of x2apic. It
can be disabled right away in the nox2apic setup function.

Disable it right away and set the state to DISABLED . This allows to
remove all the nox2apic conditionals all over the place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211703.051214090@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 951caa17d8ba..5d7488e9b66e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -187,7 +187,6 @@ static inline int x2apic_enabled(void)
 
 #define x2apic_supported()	(cpu_has_x2apic)
 #else
-static inline void disable_x2apic(void) { }
 static inline void check_x2apic(void) { }
 static inline void enable_x2apic(void) { }
 static inline int x2apic_enabled(void) { return 0; }
-- 
cgit 


From 659006bf3ae37a08706907ce1a36ddf57c9131d2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:26 +0000
Subject: x86/x2apic: Split enable and setup function

enable_x2apic() is a convoluted unreadable mess because it is used for
both enablement in early boot and for setup in cpu_init().

Split the code into x2apic_enable() for enablement and x2apic_setup()
for setup of (secondary cpus). Make use of the new state tracking to
simplify the logic.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211703.129287153@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5d7488e9b66e..ac60c603f8dd 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -179,7 +179,7 @@ static inline u64 native_x2apic_icr_read(void)
 extern int x2apic_mode;
 extern int x2apic_phys;
 extern void __init check_x2apic(void);
-extern void enable_x2apic(void);
+extern void x2apic_setup(void);
 static inline int x2apic_enabled(void)
 {
 	return cpu_has_x2apic && apic_is_x2apic_enabled();
@@ -188,7 +188,7 @@ static inline int x2apic_enabled(void)
 #define x2apic_supported()	(cpu_has_x2apic)
 #else
 static inline void check_x2apic(void) { }
-static inline void enable_x2apic(void) { }
+static inline void x2apic_setup(void) { }
 static inline int x2apic_enabled(void) { return 0; }
 
 #define x2apic_mode		(0)
-- 
cgit 


From f77aa308e5a6144a47311ad6905a1a72bc0014f9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:29 +0000
Subject: x86/smpboot: Move smpboot inlines to code

No point for a separate header file.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211703.304126687@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/smpboot_hooks.h | 68 ------------------------------------
 1 file changed, 68 deletions(-)
 delete mode 100644 arch/x86/include/asm/smpboot_hooks.h

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
deleted file mode 100644
index 0da7409f0bec..000000000000
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
- * which needs to alter them. */
-
-static inline void smpboot_clear_io_apic_irqs(void)
-{
-#ifdef CONFIG_X86_IO_APIC
-	io_apic_irqs = 0;
-#endif
-}
-
-static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-	CMOS_WRITE(0xa, 0xf);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	local_flush_tlb();
-	pr_debug("1.\n");
-	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
-							start_eip >> 4;
-	pr_debug("2.\n");
-	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
-							start_eip & 0xf;
-	pr_debug("3.\n");
-}
-
-static inline void smpboot_restore_warm_reset_vector(void)
-{
-	unsigned long flags;
-
-	/*
-	 * Install writable page 0 entry to set BIOS data area.
-	 */
-	local_flush_tlb();
-
-	/*
-	 * Paranoid:  Set warm reset code and vector here back
-	 * to default values.
-	 */
-	spin_lock_irqsave(&rtc_lock, flags);
-	CMOS_WRITE(0, 0xf);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
-}
-
-static inline void __init smpboot_setup_io_apic(void)
-{
-#ifdef CONFIG_X86_IO_APIC
-	/*
-	 * Here we can be sure that there is an IO-APIC in the system. Let's
-	 * go and set it up:
-	 */
-	if (!skip_ioapic_setup && nr_ioapics)
-		setup_IO_APIC();
-	else {
-		nr_ioapics = 0;
-	}
-#endif
-}
-
-static inline void smpboot_clear_io_apic(void)
-{
-#ifdef CONFIG_X86_IO_APIC
-	nr_ioapics = 0;
-#endif
-}
-- 
cgit 


From 8686608336e11276d72d020cb0b67bee70d9a5cd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:30 +0000
Subject: x86/ioapic: Provide stub functions for IOAPIC%3Dn

To avoid lots of ifdeffery provide proper stubs for setup_IO_APIC(),
enable_IO_APIC() and setup_ioapic_dest().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20150115211703.397170414@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/io_apic.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index bf006cce9418..2f91685fe1cd 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -279,6 +279,11 @@ static inline void disable_ioapic_support(void) { }
 #define native_ioapic_set_affinity	NULL
 #define native_setup_ioapic_entry	NULL
 #define native_eoi_ioapic_pin		NULL
+
+static inline void setup_IO_APIC(void) { }
+static inline void enable_IO_APIC(void) { }
+static inline void setup_ioapic_dest(void) { }
+
 #endif
 
 #endif /* _ASM_X86_IO_APIC_H */
-- 
cgit 


From 05f7e46d2aac359b6bcfc06b302bdd03ca0bcada Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:40 +0000
Subject: x86/smpboot: Move apic init code to apic.c

We better provide proper functions which implement the required code
flow in the apic code rather than letting the smpboot code open code
it. That allows to make more functions static and confines the APIC
functionality to apic.c where it belongs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20150115211703.907616730@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index ac60c603f8dd..92f34042be85 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -201,7 +201,6 @@ extern int get_physical_broadcast(void);
 
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
-extern void connect_bsp_APIC(void);
 extern void disconnect_bsp_APIC(int virt_wire_setup);
 extern void disable_local_APIC(void);
 extern void lapic_shutdown(void);
@@ -209,8 +208,6 @@ extern int verify_local_APIC(void);
 extern void sync_Arb_IDs(void);
 extern void init_bsp_APIC(void);
 extern void setup_local_APIC(void);
-extern void end_local_APIC_setup(void);
-extern void bsp_end_local_APIC_setup(void);
 extern void init_apic_mappings(void);
 void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
@@ -218,6 +215,9 @@ extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern int apic_force_enable(unsigned long addr);
 
+extern int apic_bsp_setup(void);
+extern void apic_ap_setup(void);
+
 /*
  * On 32bit this is mach-xxx local
  */
-- 
cgit 


From 374aab339f10f0510cec0e79d752d31d84b08aa2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Jan 2015 21:22:44 +0000
Subject: x86/apic: Reuse apic_bsp_setup() for UP APIC setup

Extend apic_bsp_setup() so the same code flow can be used for
APIC_init_uniprocessor().

Folded Jiangs fix to provide proper ordering of the UP setup.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/20150115211704.084765674@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/apic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 92f34042be85..92003f3c8a42 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -215,7 +215,7 @@ extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern int apic_force_enable(unsigned long addr);
 
-extern int apic_bsp_setup(void);
+extern int apic_bsp_setup(bool upmode);
 extern void apic_ap_setup(void);
 
 /*
-- 
cgit 


From 801806d956c2c198b9fdd3afd156a536f9a3a139 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@cs.technion.ac.il>
Date: Mon, 26 Jan 2015 09:32:23 +0200
Subject: KVM: x86: IRET emulation does not clear NMI masking

The IRET instruction should clear NMI masking, but the current implementation
does not do so.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index eb181178fe0b..57a9d94fe160 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -208,6 +208,7 @@ struct x86_emulate_ops {
 
 	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
 			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+	void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
-- 
cgit 


From 853d0289340026b30f93fd0e768340221d4e605c Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Mon, 5 Jan 2015 14:13:41 +0000
Subject: xen/grant-table: pre-populate kernel unmap ops for
 xen_gnttab_unmap_refs()

When unmapping grants, instead of converting the kernel map ops to
unmap ops on the fly, pre-populate the set of unmap ops.

This allows the grant unmap for the kernel mappings to be trivially
batched in the future.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/x86/include/asm/xen/page.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 5eea09915a15..e9f52fe2d56a 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -55,7 +55,7 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 				   struct gnttab_map_grant_ref *kmap_ops,
 				   struct page **pages, unsigned int count);
 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
-				     struct gnttab_map_grant_ref *kmap_ops,
+				     struct gnttab_unmap_grant_ref *kunmap_ops,
 				     struct page **pages, unsigned int count);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
-- 
cgit 


From 0bb599fd30108883b00c7d4a226eeb49111e6932 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Mon, 5 Jan 2015 17:06:01 +0000
Subject: xen: remove scratch frames for ballooned pages and m2p override

The scratch frame mappings for ballooned pages and the m2p override
are broken.  Remove them in preparation for replacing them with
simpler mechanisms that works.

The scratch pages did not ensure that the page was not in use.  In
particular, the foreign page could still be in use by hardware.  If
the guest reused the frame the hardware could read or write that
frame.

The m2p override did not handle the same frame being granted by two
different grant references.  Trying an M2P override lookup in this
case is impossible.

With the m2p override removed, the grant map/unmap for the kernel
mappings (for x86 PV) can be easily batched in
set_foreign_p2m_mapping() and clear_foreign_p2m_mapping().

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/x86/include/asm/xen/page.h | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index e9f52fe2d56a..358dcd338915 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -57,7 +57,6 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 				     struct gnttab_unmap_grant_ref *kunmap_ops,
 				     struct page **pages, unsigned int count);
-extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
 /*
  * Helper functions to write or read unsigned long values to/from
@@ -154,21 +153,12 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 		return mfn;
 
 	pfn = mfn_to_pfn_no_overrides(mfn);
-	if (__pfn_to_mfn(pfn) != mfn) {
-		/*
-		 * If this appears to be a foreign mfn (because the pfn
-		 * doesn't map back to the mfn), then check the local override
-		 * table to see if there's a better pfn to use.
-		 *
-		 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
-		 */
-		pfn = m2p_find_override_pfn(mfn, ~0);
-	}
+	if (__pfn_to_mfn(pfn) != mfn)
+		pfn = ~0;
 
 	/*
-	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
-	 * entry doesn't map back to the mfn and m2p_override doesn't have a
-	 * valid entry for it.
+	 * pfn is ~0 if there are no entries in the m2p for mfn or the
+	 * entry doesn't map back to the mfn.
 	 */
 	if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
 		pfn = mfn;
-- 
cgit 


From f4b4b1808690c37c7c703d43789c1988c5e7fdeb Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Wed, 28 Jan 2015 10:54:24 +0800
Subject: KVM: MMU: Add mmu help functions to support PML

This patch adds new mmu layer functions to clear/set D-bit for memory slot, and
to write protect superpages for memory slot.

In case of PML, CPU logs the dirty GPA automatically to PML buffer when CPU
updates D-bit from 0 to 1, therefore we don't have to write protect 4K pages,
instead, we only need to clear D-bit in order to log that GPA.

For superpages, we still write protect it and let page fault code to handle
dirty page logging, as we still need to split superpage to 4K pages in PML.

As PML is always enabled during guest's lifetime, to eliminate unnecessary PML
GPA logging, we set D-bit manually for the slot with dirty logging disabled.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 843bea0e70fd..4f6369b6f7d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -835,6 +835,15 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+					struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+			    struct kvm_memory_slot *memslot);
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+				   struct kvm_memory_slot *slot,
+				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
-- 
cgit 


From 1c91cad42366ce0799ca17e7ad6995418741d012 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Wed, 28 Jan 2015 10:54:26 +0800
Subject: KVM: x86: Change parameter of kvm_mmu_slot_remove_write_access

This patch changes the second parameter of kvm_mmu_slot_remove_write_access from
'slot id' to 'struct kvm_memory_slot *' to align with kvm_x86_ops dirty logging
hooks, which will be introduced in further patch.

Better way is to change second parameter of kvm_arch_commit_memory_region from
'struct kvm_userspace_memory_region *' to 'struct kvm_memory_slot * new', but it
requires changes on other non-x86 ARCH too, so avoid it now.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4f6369b6f7d2..67a98d793bf2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -834,7 +834,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
-- 
cgit 


From 88178fd4f7187bbe290c5d373fd44aabec891934 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Wed, 28 Jan 2015 10:54:27 +0800
Subject: KVM: x86: Add new dirty logging kvm_x86_ops for PML

This patch adds new kvm_x86_ops dirty logging hooks to enable/disable dirty
logging for particular memory slot, and to flush potentially logged dirty GPAs
before reporting slot->dirty_bitmap to userspace.

kvm x86 common code calls these hooks when they are available so PML logic can
be hidden to VMX specific. SVM won't be impacted as these hooks remain NULL
there.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 67a98d793bf2..57916ecb9b92 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -802,6 +802,31 @@ struct kvm_x86_ops {
 	int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
 	void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+	/*
+	 * Arch-specific dirty logging hooks. These hooks are only supposed to
+	 * be valid if the specific arch has hardware-accelerated dirty logging
+	 * mechanism. Currently only for PML on VMX.
+	 *
+	 *  - slot_enable_log_dirty:
+	 *	called when enabling log dirty mode for the slot.
+	 *  - slot_disable_log_dirty:
+	 *	called when disabling log dirty mode for the slot.
+	 *	also called when slot is created with log dirty disabled.
+	 *  - flush_log_dirty:
+	 *	called before reporting dirty_bitmap to userspace.
+	 *  - enable_log_dirty_pt_masked:
+	 *	called when reenabling log dirty for the GFNs in the mask after
+	 *	corresponding bits are cleared in slot->dirty_bitmap.
+	 */
+	void (*slot_enable_log_dirty)(struct kvm *kvm,
+				      struct kvm_memory_slot *slot);
+	void (*slot_disable_log_dirty)(struct kvm *kvm,
+				       struct kvm_memory_slot *slot);
+	void (*flush_log_dirty)(struct kvm *kvm);
+	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
+					   struct kvm_memory_slot *slot,
+					   gfn_t offset, unsigned long mask);
 };
 
 struct kvm_arch_async_pf {
-- 
cgit 


From 843e4330573cc5261ae260ce0b83dc570d8cdc05 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Wed, 28 Jan 2015 10:54:28 +0800
Subject: KVM: VMX: Add PML support in VMX

This patch adds PML support in VMX. A new module parameter 'enable_pml' is added
to allow user to enable/disable it manually.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/vmx.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 45afaee9555c..da772edd19ab 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
+#define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES			0x00100000
 
 
@@ -121,6 +122,7 @@ enum vmcs_field {
 	GUEST_LDTR_SELECTOR             = 0x0000080c,
 	GUEST_TR_SELECTOR               = 0x0000080e,
 	GUEST_INTR_STATUS               = 0x00000810,
+	GUEST_PML_INDEX			= 0x00000812,
 	HOST_ES_SELECTOR                = 0x00000c00,
 	HOST_CS_SELECTOR                = 0x00000c02,
 	HOST_SS_SELECTOR                = 0x00000c04,
@@ -140,6 +142,8 @@ enum vmcs_field {
 	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
 	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
 	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+	PML_ADDRESS			= 0x0000200e,
+	PML_ADDRESS_HIGH		= 0x0000200f,
 	TSC_OFFSET                      = 0x00002010,
 	TSC_OFFSET_HIGH                 = 0x00002011,
 	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
-- 
cgit 


From 2e6d015799d523dcce11c7d1465e6feb7b69fab1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 2 Feb 2015 15:26:09 -0200
Subject: KVM: x86: revert "add method to test PIR bitmap vector"

Revert 7c6a98dfa1ba9dc64a62e73624ecea9995736bbd, given
that testing PIR is not necessary anymore.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 57916ecb9b92..9dbc7435cbc2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -767,7 +767,6 @@ struct kvm_x86_ops {
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
-	bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
-- 
cgit 


From 874e52086f9f1b9f9bdfbf98cca8506b110b63ba Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 12 Jan 2015 15:17:22 +0200
Subject: x86, mrst: remove Moorestown specific serial drivers

Intel Moorestown platform support was removed few years ago. This is a follow
up which removes Moorestown specific code for the serial devices. It includes
mrst_max3110 and earlyprintk bits.

This was used on SFI (Medfield, Clovertrail) based platforms as well, though
new ones use normal serial interface for the console service.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: David Cohen <david.a.cohen@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/intel-mid.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index e34e097b6f9d..705d35708a50 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
 #define SFI_MTMR_MAX_NUM 8
 #define SFI_MRTC_MAX	8
 
-extern struct console early_mrst_console;
-extern void mrst_early_console_init(void);
-
 extern struct console early_hsu_console;
 extern void hsu_early_console_init(const char *);
 
-- 
cgit 


From 12cf89b550d13eb7cb86ef182bd6c04345a33a1f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 3 Feb 2015 16:45:18 -0600
Subject: livepatch: rename config to CONFIG_LIVEPATCH

Rename CONFIG_LIVE_PATCHING to CONFIG_LIVEPATCH to make the naming of
the config and the code more consistent.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Jingoo Han <jg1.han@samsung.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/livepatch.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index 26e58134c8cb..a455a53d789a 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -24,7 +24,7 @@
 #include <linux/module.h>
 #include <linux/ftrace.h>
 
-#ifdef CONFIG_LIVE_PATCHING
+#ifdef CONFIG_LIVEPATCH
 static inline int klp_check_compiler_support(void)
 {
 #ifndef CC_USING_FENTRY
@@ -40,7 +40,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 	regs->ip = ip;
 }
 #else
-#error Live patching support is disabled; check CONFIG_LIVE_PATCHING
+#error Live patching support is disabled; check CONFIG_LIVEPATCH
 #endif
 
 #endif /* _ASM_X86_LIVEPATCH_H */
-- 
cgit 


From 375074cc736ab1d89a708c0a8d7baa4a70d5d476 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 24 Oct 2014 15:58:07 -0700
Subject: x86: Clean up cr4 manipulation

CR4 manipulation was split, seemingly at random, between direct
(write_cr4) and using a helper (set/clear_in_cr4).  Unfortunately,
the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code,
which only a small subset of users actually wanted.

This patch replaces all cr4 access in functions that don't leave cr4
exactly the way they found it with new helpers cr4_set_bits,
cr4_clear_bits, and cr4_set_bits_and_update_boot.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Vince Weaver <vince@deater.net>
Cc: "hillf.zj" <hillf.zj@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/495a10bdc9e67016b8fd3945700d46cfd5c12c2f.1414190806.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 33 ---------------------------------
 arch/x86/include/asm/tlbflush.h  | 37 +++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/virtext.h   |  3 ++-
 3 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a092a0cce0b7..ec1c93588cef 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss,
 #define set_iopl_mask native_set_iopl_mask
 #endif /* CONFIG_PARAVIRT */
 
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
-	unsigned long cr4;
-
-	mmu_cr4_features |= mask;
-	if (trampoline_cr4_features)
-		*trampoline_cr4_features = mmu_cr4_features;
-	cr4 = read_cr4();
-	cr4 |= mask;
-	write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
-	unsigned long cr4;
-
-	mmu_cr4_features &= ~mask;
-	if (trampoline_cr4_features)
-		*trampoline_cr4_features = mmu_cr4_features;
-	cr4 = read_cr4();
-	cr4 &= ~mask;
-	write_cr4(cr4);
-}
-
 typedef struct {
 	unsigned long		seg;
 } mm_segment_t;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..fc0c4bc356ce 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,43 @@
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = read_cr4();
+	cr4 |= mask;
+	write_cr4(cr4);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = read_cr4();
+	cr4 &= ~mask;
+	write_cr4(cr4);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags.  This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
+	cr4_set_bits(mask);
+}
+
 static inline void __native_flush_tlb(void)
 {
 	native_write_cr3(native_read_cr3());
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..f41e19ca717b 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@
 
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/tlbflush.h>
 
 /*
  * VMX functions:
@@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void)
 static inline void cpu_vmxoff(void)
 {
 	asm volatile (ASM_VMX_VMXOFF : : : "cc");
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+	cr4_clear_bits(X86_CR4_VMXE);
 }
 
 static inline int cpu_vmx_enabled(void)
-- 
cgit 


From 1e02ce4cccdcb9688386e5b8d2c9fa4660b45389 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 24 Oct 2014 15:58:08 -0700
Subject: x86: Store a per-cpu shadow copy of CR4

Context switches and TLB flushes can change individual bits of CR4.
CR4 reads take several cycles, so store a shadow copy of CR4 in a
per-cpu variable.

To avoid wasting a cache line, I added the CR4 shadow to
cpu_tlbstate, which is already touched in switch_mm.  The heaviest
users of the cr4 shadow will be switch_mm and __switch_to_xtra, and
__switch_to_xtra is called shortly after switch_mm during context
switch, so the cacheline is likely to be hot.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Vince Weaver <vince@deater.net>
Cc: "hillf.zj" <hillf.zj@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/3a54dd3353fffbf84804398e00dfdc5b7c1afd7d.1414190806.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/paravirt.h      |  6 ++---
 arch/x86/include/asm/special_insns.h |  6 ++---
 arch/x86/include/asm/tlbflush.h      | 52 +++++++++++++++++++++++++++---------
 arch/x86/include/asm/virtext.h       |  2 +-
 4 files changed, 46 insertions(+), 20 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 32444ae939ca..965c47d254aa 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
 }
 
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
 {
 	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
 }
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
 {
 	return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
 }
 
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
 {
 	PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
 }
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c080a4e9..6a4b00fafb00 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x)
 	native_write_cr3(x);
 }
 
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
 {
 	return native_read_cr4();
 }
 
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
 {
 	return native_read_cr4_safe();
 }
 
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
 {
 	native_write_cr4(x);
 }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fc0c4bc356ce..cd791948b286 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,14 +15,37 @@
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_state {
+#ifdef CONFIG_SMP
+	struct mm_struct *active_mm;
+	int state;
+#endif
+
+	/*
+	 * Access to this CR4 shadow and to H/W CR4 is protected by
+	 * disabling interrupts when modifying either one.
+	 */
+	unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+	this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
 /* Set in this cpu's CR4. */
 static inline void cr4_set_bits(unsigned long mask)
 {
 	unsigned long cr4;
 
-	cr4 = read_cr4();
-	cr4 |= mask;
-	write_cr4(cr4);
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	if ((cr4 | mask) != cr4) {
+		cr4 |= mask;
+		this_cpu_write(cpu_tlbstate.cr4, cr4);
+		__write_cr4(cr4);
+	}
 }
 
 /* Clear in this cpu's CR4. */
@@ -30,9 +53,18 @@ static inline void cr4_clear_bits(unsigned long mask)
 {
 	unsigned long cr4;
 
-	cr4 = read_cr4();
-	cr4 &= ~mask;
-	write_cr4(cr4);
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	if ((cr4 & ~mask) != cr4) {
+		cr4 &= ~mask;
+		this_cpu_write(cpu_tlbstate.cr4, cr4);
+		__write_cr4(cr4);
+	}
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+	return this_cpu_read(cpu_tlbstate.cr4);
 }
 
 /*
@@ -61,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
 {
 	unsigned long cr4;
 
-	cr4 = native_read_cr4();
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
 	/* clear PGE */
 	native_write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
@@ -221,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 #define TLBSTATE_OK	1
 #define TLBSTATE_LAZY	2
 
-struct tlb_state {
-	struct mm_struct *active_mm;
-	int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
 static inline void reset_lazy_tlbstate(void)
 {
 	this_cpu_write(cpu_tlbstate.state, 0);
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index f41e19ca717b..cce9ee68e335 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -46,7 +46,7 @@ static inline void cpu_vmxoff(void)
 
 static inline int cpu_vmx_enabled(void)
 {
-	return read_cr4() & X86_CR4_VMXE;
+	return __read_cr4() & X86_CR4_VMXE;
 }
 
 /** Disable VMX if it is enabled on the current CPU
-- 
cgit 


From 22c4bd9fa921c2b1b3f2420d7b9dabbe982f3059 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 24 Oct 2014 15:58:09 -0700
Subject: x86: Add a comment clarifying LDT context switching

The code is correct, but only for a rather subtle reason.  This
confused me for quite a while when I read switch_mm, so clarify the
code to avoid confusing other people, too.

TBH, I wouldn't be surprised if this code was only correct by
accident.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Vince Weaver <vince@deater.net>
Cc: "hillf.zj" <hillf.zj@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/0db86397f968996fb772c443c251415b0b430ddd.1414190806.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4b75d591eb5e..52c18359f1dc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -55,12 +55,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/*
 		 * Load the LDT, if the LDT is different.
 		 *
-		 * It's possible leave_mm(prev) has been called.  If so,
-		 * then prev->context.ldt could be out of sync with the
-		 * LDT descriptor or the LDT register.  This can only happen
-		 * if prev->context.ldt is non-null, since we never free
-		 * an LDT.  But LDTs can't be shared across mms, so
-		 * prev->context.ldt won't be equal to next->context.ldt.
+		 * It's possible that prev->context.ldt doesn't match
+		 * the LDT register.  This can happen if leave_mm(prev)
+		 * was called and then modify_ldt changed
+		 * prev->context.ldt but suppressed an IPI to this CPU.
+		 * In this case, prev->context.ldt != NULL, because we
+		 * never free an LDT while the mm still exists.  That
+		 * means that next->context.ldt != prev->context.ldt,
+		 * because mms never share an LDT.
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
 			load_LDT_nolock(&next->context);
-- 
cgit 


From 7911d3f7af14a614617e38245fedf98a724e46a9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 24 Oct 2014 15:58:12 -0700
Subject: perf/x86: Only allow rdpmc if a perf_event is mapped

We currently allow any process to use rdpmc.  This significantly
weakens the protection offered by PR_TSC_DISABLED, and it could be
helpful to users attempting to exploit timing attacks.

Since we can't enable access to individual counters, use a very
coarse heuristic to limit access to rdpmc: allow access only when
a perf_event is mmapped.  This protects seccomp sandboxes.

There is plenty of room to further tighen these restrictions.  For
example, this allows rdpmc for any x86_pmu event, but it's only
useful for self-monitoring tasks.

As a side effect, cap_user_rdpmc will now be false for AMD uncore
events.  This isn't a real regression, since .event_idx is disabled
for these events anyway for the time being.  Whenever that gets
re-added, the cap_user_rdpmc code can be adjusted or refactored
accordingly.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Vince Weaver <vince@deater.net>
Cc: "hillf.zj" <hillf.zj@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/a2bdb3cf3a1d70c26980d7c6dddfbaa69f3182bf.1414190806.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu.h         |  2 ++
 arch/x86/include/asm/mmu_context.h | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 876e74e8eec7..09b9620a73b4 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,6 +19,8 @@ typedef struct {
 
 	struct mutex lock;
 	void __user *vdso;
+
+	atomic_t perf_rdpmc_allowed;	/* nonzero if rdpmc is allowed */
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 52c18359f1dc..89c1fece224e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -18,6 +18,18 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
 }
 #endif	/* !CONFIG_PARAVIRT */
 
+#ifdef CONFIG_PERF_EVENTS
+static inline void load_mm_cr4(struct mm_struct *mm)
+{
+	if (atomic_read(&mm->context.perf_rdpmc_allowed))
+		cr4_set_bits(X86_CR4_PCE);
+	else
+		cr4_clear_bits(X86_CR4_PCE);
+}
+#else
+static inline void load_mm_cr4(struct mm_struct *mm) {}
+#endif
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -52,6 +64,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Stop flush ipis for the previous mm */
 		cpumask_clear_cpu(cpu, mm_cpumask(prev));
 
+		/* Load per-mm CR4 state */
+		load_mm_cr4(next);
+
 		/*
 		 * Load the LDT, if the LDT is different.
 		 *
@@ -87,6 +102,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+			load_mm_cr4(next);
 			load_LDT_nolock(&next->context);
 		}
 	}
-- 
cgit 


From a66734297f78707ce39d756b656bfae861d53f62 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Fri, 24 Oct 2014 15:58:13 -0700
Subject: perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks

While perfmon2 is a sufficiently evil library (it pokes MSRs
directly) that breaking it is fair game, it's still useful, so we
might as well try to support it.  This allows users to write 2 to
/sys/devices/cpu/rdpmc to disable all rdpmc protection so that hack
like perfmon2 can continue to work.

At some point, if perf_event becomes fast enough to replace
perfmon2, then this can go.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Vince Weaver <vince@deater.net>
Cc: "hillf.zj" <hillf.zj@alibaba-inc.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/caac3c1c707dcca48ecbc35f4def21495856f479.1414190806.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 89c1fece224e..883f6b933fa4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -19,9 +19,12 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
 #endif	/* !CONFIG_PARAVIRT */
 
 #ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_always_available;
+
 static inline void load_mm_cr4(struct mm_struct *mm)
 {
-	if (atomic_read(&mm->context.perf_rdpmc_allowed))
+	if (static_key_true(&rdpmc_always_available) ||
+	    atomic_read(&mm->context.perf_rdpmc_allowed))
 		cr4_set_bits(X86_CR4_PCE);
 	else
 		cr4_clear_bits(X86_CR4_PCE);
-- 
cgit 


From 1c2b364b225a5a93dbd1f317bd000d2fec2694be Mon Sep 17 00:00:00 2001
From: Tiejun Chen <tiejun.chen@intel.com>
Date: Thu, 5 Feb 2015 17:22:26 +0800
Subject: kvm: remove KVM_MMIO_SIZE

After f78146b0f923, "KVM: Fix page-crossing MMIO", and
87da7e66a405, "KVM: x86: fix vcpu->mmio_fragments overflow",
actually KVM_MMIO_SIZE is gone.

Signed-off-by: Tiejun Chen <tiejun.chen@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9dbc7435cbc2..848947ac6ade 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,8 +38,6 @@
 #define KVM_PRIVATE_MEM_SLOTS 3
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
-#define KVM_MMIO_SIZE 16
-
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 
-- 
cgit 


From b4b55cda587442477a3a9f0669e26bba4b7800c0 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 5 Feb 2015 13:44:47 +0800
Subject: x86/PCI: Refine the way to release PCI IRQ resources

Some PCI device drivers assume that pci_dev->irq won't change after
calling pci_disable_device() and pci_enable_device() during suspend and
resume.

Commit c03b3b0738a5 ("x86, irq, mpparse: Release IOAPIC pin when
PCI device is disabled") frees PCI IRQ resources when pci_disable_device()
is called and reallocate IRQ resources when pci_enable_device() is
called again. This breaks above assumption. So commit 3eec595235c1
("x86, irq, PCI: Keep IRQ assignment for PCI devices during
suspend/hibernation") and 9eabc99a635a ("x86, irq, PCI: Keep IRQ
assignment for runtime power management") fix the issue by avoiding
freeing/reallocating IRQ resources during PCI device suspend/resume.
They achieve this by checking dev.power.is_prepared and
dev.power.runtime_status.  PM maintainer, Rafael, then pointed out that
it's really an ugly fix which leaking PM internal state information to
IRQ subsystem.

Recently David Vrabel <david.vrabel@citrix.com> also reports an
regression in pciback driver caused by commit cffe0a2b5a34 ("x86, irq:
Keep balance of IOAPIC pin reference count"). Please refer to:
http://lkml.org/lkml/2015/1/14/546

So this patch refine the way to release PCI IRQ resources. Instead of
releasing PCI IRQ resources in pci_disable_device()/
pcibios_disable_device(), we now release it at driver unbinding
notification BUS_NOTIFY_UNBOUND_DRIVER. In other word, we only release
PCI IRQ resources when there's no driver bound to the PCI device, and
it keeps the assumption that pci_dev->irq won't through multiple
invocation of pci_enable_device()/pci_disable_device().

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/pci_x86.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 164e3f8d3c3d..fa1195dae425 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock;
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
 
-extern bool mp_should_keep_irq(struct device *dev);
-
 struct pci_raw_ops {
 	int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
 						int reg, int len, u32 *val);
-- 
cgit 


From f7819512996361280b86259222456fcf15aad926 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 4 Feb 2015 18:20:58 +0100
Subject: kvm: add halt_poll_ns module parameter

This patch introduces a new module parameter for the KVM module; when it
is present, KVM attempts a bit of polling on every HLT before scheduling
itself out via kvm_vcpu_block.

This parameter helps a lot for latency-bound workloads---in particular
I tested it with O_DSYNC writes with a battery-backed disk in the host.
In this case, writes are fast (because the data doesn't have to go all
the way to the platters) but they cannot be merged by either the host or
the guest.  KVM's performance here is usually around 30% of bare metal,
or 50% if you use cache=directsync or cache=writethrough (these
parameters avoid that the guest sends pointless flush requests, and
at the same time they are not slow because of the battery-backed cache).
The bad performance happens because on every halt the host CPU decides
to halt itself too.  When the interrupt comes, the vCPU thread is then
migrated to a new physical CPU, and in general the latency is horrible
because the vCPU thread has to be scheduled back in.

With this patch performance reaches 60-65% of bare metal and, more
important, 99% of what you get if you use idle=poll in the guest.  This
means that the tunable gets rid of this particular bottleneck, and more
work can be done to improve performance in the kernel or QEMU.

Of course there is some price to pay; every time an otherwise idle vCPUs
is interrupted by an interrupt, it will poll unnecessarily and thus
impose a little load on the host.  The above results were obtained with
a mostly random value of the parameter (500000), and the load was around
1.5-2.5% CPU usage on one of the host's core for each idle guest vCPU.

The patch also adds a new stat, /sys/kernel/debug/kvm/halt_successful_poll,
that can be used to tune the parameter.  It counts how many HLT
instructions received an interrupt during the polling period; each
successful poll avoids that Linux schedules the VCPU thread out and back
in, and may also avoid a likely trip to C1 and back for the physical CPU.

While the VM is idle, a Linux 4 VCPU VM halts around 10 times per second.
Of these halts, almost all are failed polls.  During the benchmark,
instead, basically all halts end within the polling period, except a more
or less constant stream of 50 per second coming from vCPUs that are not
running the benchmark.  The wasted time is thus very low.  Things may
be slightly different for Windows VMs, which have a ~10 ms timer tick.

The effect is also visible on Marcelo's recently-introduced latency
test for the TSC deadline timer.  Though of course a non-RT kernel has
awful latency bounds, the latency of the timer is around 8000-10000 clock
cycles compared to 20000-120000 without setting halt_poll_ns.  For the TSC
deadline timer, thus, the effect is both a smaller average latency and
a smaller variance.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 848947ac6ade..a236e39cc385 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -655,6 +655,7 @@ struct kvm_vcpu_stat {
 	u32 irq_window_exits;
 	u32 nmi_window_exits;
 	u32 halt_exits;
+	u32 halt_successful_poll;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
 	u32 irq_exits;
-- 
cgit 


From 0a191362058391878cc2a4d4ccddcd8223eb4f79 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 10 Feb 2015 14:11:22 -0800
Subject: x86: drop _PAGE_FILE and pte_file()-related helpers

We've replaced remap_file_pages(2) implementation with emulation.  Nobody
creates non-linear mapping anymore.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable-2level.h | 38 +----------------------------------
 arch/x86/include/asm/pgtable-3level.h | 12 -----------
 arch/x86/include/asm/pgtable.h        | 20 ------------------
 arch/x86/include/asm/pgtable_64.h     |  6 +-----
 arch/x86/include/asm/pgtable_types.h  |  3 ---
 5 files changed, 2 insertions(+), 77 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 206a87fdd22d..fd74a11959de 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
 	return ((value >> rightshift) & mask) << leftshift;
 }
 
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range.
- */
-#define PTE_FILE_MAX_BITS	29
-#define PTE_FILE_SHIFT1		(_PAGE_BIT_PRESENT + 1)
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-
-#define PTE_FILE_MASK1		((1U << PTE_FILE_BITS1) - 1)
-#define PTE_FILE_MASK2		((1U << PTE_FILE_BITS2) - 1)
-
-#define PTE_FILE_LSHIFT2	(PTE_FILE_BITS1)
-#define PTE_FILE_LSHIFT3	(PTE_FILE_BITS1 + PTE_FILE_BITS2)
-
-static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
-{
-	return (pgoff_t)
-		(pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1,  0)		    +
-		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2,  PTE_FILE_LSHIFT2) +
-		 pte_bitop(pte.pte_low, PTE_FILE_SHIFT3,           -1UL,  PTE_FILE_LSHIFT3));
-}
-
-static __always_inline pte_t pgoff_to_pte(pgoff_t off)
-{
-	return (pte_t){
-		.pte_low =
-			pte_bitop(off,                0, PTE_FILE_MASK1,  PTE_FILE_SHIFT1) +
-			pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2,  PTE_FILE_SHIFT2) +
-			pte_bitop(off, PTE_FILE_LSHIFT3,           -1UL,  PTE_FILE_SHIFT3) +
-			_PAGE_FILE,
-	};
-}
-
 /* Encode and de-code a swap entry */
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_TYPE_BITS 5
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 81bb91b49a88..cdaa58c9b39e 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
-/*
- * Bits 0, 6 and 7 are taken in the low part of the pte,
- * put the 32 bits of offset into the high part.
- *
- * For soft-dirty tracking 11 bit is taken from
- * the low part of pte as well.
- */
-#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off)						\
-	((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-#define PTE_FILE_MAX_BITS       32
-
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e8a5454acc99..0fe03f834fb1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte)
 	return pte_flags(pte) & _PAGE_RW;
 }
 
-static inline int pte_file(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_FILE;
-}
-
 static inline int pte_huge(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_PSE;
@@ -329,21 +324,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
-	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
-	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_SOFT_DIRTY;
-}
-
 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 
 /*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 4572b2f30237..e227970f983e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* PUD - Level3 access */
 
 /* PMD  - Level 2 access */
-#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) |	\
-					    _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
 
 /* PTE - Level 1 access. */
 
@@ -145,7 +141,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
 /* Encode and de-code a swap entry */
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_TYPE_BITS 5
 #ifdef CONFIG_NUMA_BALANCING
 /* Automatic NUMA balancing needs to be distinguishable from swap entries */
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 25bcd4a89517..5185a4f599ec 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -38,8 +38,6 @@
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE		_PAGE_BIT_DIRTY
 
 #define _PAGE_PRESENT	(_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
 #define _PAGE_RW	(_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -114,7 +112,6 @@
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
-- 
cgit 


From d9bab50aa46ce46dd4537d455eb13b200cdac516 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Feb 2015 15:28:01 +1030
Subject: lguest: remove NOTIFY call and eventfd facility.

Disappointing, as this was kind of neat (especially getting to use RCU
to manage the address -> eventfd mapping).  But now the devices are PCI
handled in userspace, we get rid of both the NOTIFY hypercall and
the interface to connect an eventfd.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/lguest_hcall.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 879fd7d33877..ef01fef3eebc 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -16,7 +16,6 @@
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
-#define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
 #define LHCALL_SEND_INTERRUPTS	19
 
-- 
cgit 


From d016bf7ece53b2b947bfd769e0842fd2feb7556b Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 11 Feb 2015 15:26:41 -0800
Subject: mm: make FIRST_USER_ADDRESS unsigned long on all archs

LKP has triggered a compiler warning after my recent patch "mm: account
pmd page tables to the process":

    mm/mmap.c: In function 'exit_mmap':
 >> mm/mmap.c:2857:2: warning: right shift count >= width of type [enabled by default]

The code:

 > 2857                WARN_ON(mm_nr_pmds(mm) >
   2858                                round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT);

In this, on tile, we have FIRST_USER_ADDRESS defined as 0.  round_up() has
the same type -- int.  PUD_SHIFT.

I think the best way to fix it is to define FIRST_USER_ADDRESS as unsigned
long.  On every arch for consistency.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 5185a4f599ec..3e0230c94cff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -4,7 +4,7 @@
 #include <linux/const.h>
 #include <asm/page_types.h>
 
-#define FIRST_USER_ADDRESS	0
+#define FIRST_USER_ADDRESS	0UL
 
 #define _PAGE_BIT_PRESENT	0	/* is present */
 #define _PAGE_BIT_RW		1	/* writeable */
-- 
cgit 


From e7bb4b6d1609cce391af1e7bc6f31d14f1a3a890 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:19 -0800
Subject: mm: add p[te|md] protnone helpers for use by NUMA balancing

This is a preparatory patch that introduces protnone helpers for automatic
NUMA balancing.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0fe03f834fb1..f519b0b529dd 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -483,6 +483,22 @@ static inline int pmd_present(pmd_t pmd)
 				 _PAGE_NUMA);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_PROTNONE;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static inline int pmd_none(pmd_t pmd)
 {
 	/* Only check low word on 32-bit platforms, since it might be
-- 
cgit 


From 21d9ee3eda7792c45880b2f11bff8e95c9a061fb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:32 -0800
Subject: mm: remove remaining references to NUMA hinting bits and helpers

This patch removes the NUMA PTE bits and associated helpers.  As a
side-effect it increases the maximum possible swap space on x86-64.

One potential source of problems is races between the marking of PTEs
PROT_NONE, NUMA hinting faults and migration.  It must be guaranteed that
a PTE being protected is not faulted in parallel, seen as a pte_none and
corrupting memory.  The base case is safe but transhuge has problems in
the past due to an different migration mechanism and a dependance on page
lock to serialise migrations and warrants a closer look.

task_work hinting update			parallel fault
------------------------			--------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
						__handle_mm_fault
						pmd_none
						  do_huge_pmd_anonymous_page
						  read? pmd_lock blocks until hinting complete, fail !pmd_none test
						  write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none
      pmd_modify
      set_pmd_at

task_work hinting update			parallel migration
------------------------			------------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
						__handle_mm_fault
						  do_huge_pmd_numa_page
						    migrate_misplaced_transhuge_page
						    pmd_lock waits for updates to complete, recheck pmd_same
      pmd_modify
      set_pmd_at

Both of those are safe and the case where a transhuge page is inserted
during a protection update is unchanged.  The case where two processes try
migrating at the same time is unchanged by this series so should still be
ok.  I could not find a case where we are accidentally depending on the
PTE not being cleared and flushed.  If one is missed, it'll manifest as
corruption problems that start triggering shortly after this series is
merged and only happen when NUMA balancing is enabled.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable.h       | 22 +++----------------
 arch/x86/include/asm/pgtable_64.h    |  5 -----
 arch/x86/include/asm/pgtable_types.h | 41 ++----------------------------------
 3 files changed, 5 insertions(+), 63 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f519b0b529dd..34d42a7d5595 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+	return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -442,13 +442,6 @@ static inline int pte_same(pte_t a, pte_t b)
 }
 
 static inline int pte_present(pte_t a)
-{
-	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
-			       _PAGE_NUMA);
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t a)
 {
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
@@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 	if (pte_flags(a) & _PAGE_PRESENT)
 		return true;
 
-	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+	if ((pte_flags(a) & _PAGE_PROTNONE) &&
 			mm_tlb_flush_pending(mm))
 		return true;
 
@@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd)
 	 * the _PAGE_PSE flag will remain set at all times while the
 	 * _PAGE_PRESENT bit is clear).
 	 */
-	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
-				 _PAGE_NUMA);
+	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 
 static inline int pmd_bad(pmd_t pmd)
 {
-#ifdef CONFIG_NUMA_BALANCING
-	/* pmd_numa check */
-	if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
-		return 0;
-#endif
 	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
 }
 
@@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
 static inline int pte_swp_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
 }
 
 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 #endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e227970f983e..2ee781114d34 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 /* Encode and de-code a swap entry */
 #define SWP_TYPE_BITS 5
-#ifdef CONFIG_NUMA_BALANCING
-/* Automatic NUMA balancing needs to be distinguishable from swap entries */
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
-#else
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#endif
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3e0230c94cff..8c7c10802e9c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -27,14 +27,6 @@
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
-/*
- * Swap offsets on configurations that allow automatic NUMA balancing use the
- * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
- * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
- * maximum possible swap space from 16TB to 8TB.
- */
-#define _PAGE_BIT_NUMA		(_PAGE_BIT_GLOBAL+1)
-
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
@@ -75,21 +67,6 @@
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
-/*
- * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
- * that is not present. The hinting fault gathers numa placement statistics
- * (see pte_numa()). The bit is always zero when the PTE is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- */
-#ifdef CONFIG_NUMA_BALANCING
-#define _PAGE_NUMA	(_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
-#else
-#define _PAGE_NUMA	(_AT(pteval_t, 0))
-#endif
-
 /*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
@@ -122,8 +99,8 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
 			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
-			 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
+			 _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 /*
  * The cache modes defined here are used to translate between pure SW usage
@@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte)
 	return native_pte_val(pte) & PTE_FLAGS_MASK;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-/* Set of bits that distinguishes present, prot_none and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
-static inline pteval_t ptenuma_flags(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_NUMA_MASK;
-}
-
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
-{
-	return pmd_flags(pmd) & _PAGE_NUMA_MASK;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #define pgprot_val(x)	((x).pgprot)
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-- 
cgit 


From c819f37e7e174d68cd013abf33725b4e07ced023 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Feb 2015 14:58:38 -0800
Subject: x86: mm: restore original pte_special check

Commit b38af4721f59 ("x86,mm: fix pte_special versus pte_numa") adjusted
the pte_special check to take into account that a special pte had
SPECIAL and neither PRESENT nor PROTNONE.  Now that NUMA hinting PTEs
are no longer modifying _PAGE_PRESENT it should be safe to restore the
original pte_special behaviour.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 34d42a7d5595..67fc3d2b0aab 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -132,13 +132,7 @@ static inline int pte_exec(pte_t pte)
 
 static inline int pte_special(pte_t pte)
 {
-	/*
-	 * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
-	 * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
-	 * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
-	 */
-	return (pte_flags(pte) & _PAGE_SPECIAL) &&
-		(pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
+	return pte_flags(pte) & _PAGE_SPECIAL;
 }
 
 static inline unsigned long pte_pfn(pte_t pte)
-- 
cgit 


From f56141e3e2d9aabf7e6b89680ab572c2cdbb2a24 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 12 Feb 2015 15:01:14 -0800
Subject: all arches, signal: move restart_block to struct task_struct

If an attacker can cause a controlled kernel stack overflow, overwriting
the restart block is a very juicy exploit target.  This is because the
restart_block is held in the same memory allocation as the kernel stack.

Moving the restart block to struct task_struct prevents this exploit by
making the restart_block harder to locate.

Note that there are other fields in thread_info that are also easy
targets, at least on some architectures.

It's also a decent simplification, since the restart code is more or less
identical on all architectures.

[james.hogan@imgtec.com: metag: align thread_info::supervisor_stack]
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: David Miller <davem@davemloft.net>
Acked-by: Richard Weinberger <richard@nod.at>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Tested-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Chris Zankel <chris@zankel.net>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/thread_info.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e82e95abc92b..1d4e4f279a32 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
 	__u32			cpu;		/* current CPU */
 	int			saved_preempt_count;
 	mm_segment_t		addr_limit;
-	struct restart_block    restart_block;
 	void __user		*sysenter_return;
 	unsigned int		sig_on_uaccess_error:1;
 	unsigned int		uaccess_err:1;	/* uaccess failed */
@@ -45,9 +44,6 @@ struct thread_info {
 	.cpu		= 0,			\
 	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
-- 
cgit 


From ef7f0d6a6ca8c9e4b27d78895af86c2fbfaeedb2 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:25 -0800
Subject: x86_64: add KASan support

This patch adds arch specific code for kernel address sanitizer.

16TB of virtual addressed used for shadow memory.  It's located in range
[ffffec0000000000 - fffffc0000000000] between vmemmap and %esp fixup
stacks.

At early stage we map whole shadow region with zero page.  Latter, after
pages mapped to direct mapping address range we unmap zero pages from
corresponding shadow (see kasan_map_shadow()) and allocate and map a real
shadow memory reusing vmemmap_populate() function.

Also replace __pa with __pa_nodebug before shadow initialized.  __pa with
CONFIG_DEBUG_VIRTUAL=y make external function call (__phys_addr)
__phys_addr is instrumented, so __asan_load could be called before shadow
area initialized.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/kasan.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 arch/x86/include/asm/kasan.h

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
new file mode 100644
index 000000000000..8b22422fbad8
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+/*
+ * Compiler uses shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so shadow
+ * for kernel really starts from compiler's shadow offset +
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
+ */
+#define KASAN_SHADOW_START      (KASAN_SHADOW_OFFSET + \
+					(0xffff800000000000ULL >> 3))
+/* 47 bits for kernel address -> (47 - 3) bits for shadow */
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+
+#ifndef __ASSEMBLY__
+
+extern pte_t kasan_zero_pte[];
+extern pte_t kasan_zero_pmd[];
+extern pte_t kasan_zero_pud[];
+
+#ifdef CONFIG_KASAN
+void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_init(void);
+#else
+static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
-- 
cgit 


From 393f203f5fd54421fddb1e2a263f64d3876eeadb Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:56 -0800
Subject: x86_64: kasan: add interceptors for memset/memmove/memcpy functions

Recently instrumentation of builtin functions calls was removed from GCC
5.0.  To check the memory accessed by such functions, userspace asan
always uses interceptors for them.

So now we should do this as well.  This patch declares
memset/memmove/memcpy as weak symbols.  In mm/kasan/kasan.c we have our
own implementation of those functions which checks memory before accessing
it.

Default memset/memmove/memcpy now now always have aliases with '__'
prefix.  For files that built without kasan instrumentation (e.g.
mm/slub.c) original mem* replaced (via #define) with prefixed variants,
cause we don't want to check memory accesses there.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/string_64.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c468fc2c..e4661196994e 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
+extern void *__memcpy(void *to, const void *from, size_t len);
+
 #ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
 extern void *memcpy(void *to, const void *from, size_t len);
 #else
-extern void *__memcpy(void *to, const void *from, size_t len);
 #define memcpy(dst, src, len)					\
 ({								\
 	size_t __len = (len);					\
@@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
 void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
 size_t strlen(const char *s);
@@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src);
 char *strcat(char *dest, const char *src);
 int strcmp(const char *cs, const char *ct);
 
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that not instrumented (e.g. mm/slub.c) we
+ * should use not instrumented version of mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
-- 
cgit 


From c420f167db8c799d69fe43a801c58a7f02e9d57c Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Fri, 13 Feb 2015 14:39:59 -0800
Subject: kasan: enable stack instrumentation

Stack instrumentation allows to detect out of bounds memory accesses for
variables allocated on stack.  Compiler adds redzones around every
variable on stack and poisons redzones in function's prologue.

Such approach significantly increases stack usage, so all in-kernel stacks
size were doubled.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/page_64_types.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 75450b2c7be4..4edd53b79a81 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,17 +1,23 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_SIZE_ORDER	2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
 
 #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
 #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
 
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
 
 #define DOUBLEFAULT_STACK 1
-- 
cgit 


From d6abfdb2022368d8c6c4be3f11a06656601a6cc2 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Fri, 6 Feb 2015 16:44:11 +0530
Subject: x86/spinlocks/paravirt: Fix memory corruption on unlock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Paravirt spinlock clears slowpath flag after doing unlock.
As explained by Linus currently it does:

                prev = *lock;
                add_smp(&lock->tickets.head, TICKET_LOCK_INC);

                /* add_smp() is a full mb() */

                if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
                        __ticket_unlock_slowpath(lock, prev);

which is *exactly* the kind of things you cannot do with spinlocks,
because after you've done the "add_smp()" and released the spinlock
for the fast-path, you can't access the spinlock any more.  Exactly
because a fast-path lock might come in, and release the whole data
structure.

Linus suggested that we should not do any writes to lock after unlock(),
and we can move slowpath clearing to fastpath lock.

So this patch implements the fix with:

 1. Moving slowpath flag to head (Oleg):
    Unlocked locks don't care about the slowpath flag; therefore we can keep
    it set after the last unlock, and clear it again on the first (try)lock.
    -- this removes the write after unlock. note that keeping slowpath flag would
    result in unnecessary kicks.
    By moving the slowpath flag from the tail to the head ticket we also avoid
    the need to access both the head and tail tickets on unlock.

 2. use xadd to avoid read/write after unlock that checks the need for
    unlock_kick (Linus):
    We further avoid the need for a read-after-release by using xadd;
    the prev head value will include the slowpath flag and indicate if we
    need to do PV kicking of suspended spinners -- on modern chips xadd
    isn't (much) more expensive than an add + load.

Result:
 setup: 16core (32 cpu +ht sandy bridge 8GB 16vcpu guest)
 benchmark overcommit %improve
 kernbench  1x           -0.13
 kernbench  2x            0.02
 dbench     1x           -1.77
 dbench     2x           -0.63

[Jeremy: Hinted missing TICKET_LOCK_INC for kick]
[Oleg: Moved slowpath flag to head, ticket_equals idea]
[PeterZ: Added detailed changelog]

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Jones <drjones@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Fernando Luis Vázquez Cao <fernando_b1@lab.ntt.co.jp>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Waiman Long <Waiman.Long@hp.com>
Cc: a.ryabinin@samsung.com
Cc: dave@stgolabs.net
Cc: hpa@zytor.com
Cc: jasowang@redhat.com
Cc: jeremy@goop.org
Cc: paul.gortmaker@windriver.com
Cc: riel@redhat.com
Cc: tglx@linutronix.de
Cc: waiman.long@hp.com
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20150215173043.GA7471@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/spinlock.h | 94 ++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 48 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 625660f8a2fc..cf87de3fc390 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key);
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
 {
-	set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+	set_bit(0, (volatile unsigned long *)&lock->tickets.head);
 }
 
 #else  /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
 }
 
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
+static inline int  __tickets_equal(__ticket_t one, __ticket_t two)
+{
+	return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
+
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+							__ticket_t head)
+{
+	if (head & TICKET_SLOWPATH_FLAG) {
+		arch_spinlock_t old, new;
+
+		old.tickets.head = head;
+		new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+		old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+
+		/* try to clear slowpath flag when there are no contenders */
+		cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+	}
+}
 
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
-	return lock.tickets.head == lock.tickets.tail;
+	return __tickets_equal(lock.tickets.head, lock.tickets.tail);
 }
 
 /*
@@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 	if (likely(inc.head == inc.tail))
 		goto out;
 
-	inc.tail &= ~TICKET_SLOWPATH_FLAG;
 	for (;;) {
 		unsigned count = SPIN_THRESHOLD;
 
 		do {
-			if (READ_ONCE(lock->tickets.head) == inc.tail)
-				goto out;
+			inc.head = READ_ONCE(lock->tickets.head);
+			if (__tickets_equal(inc.head, inc.tail))
+				goto clear_slowpath;
 			cpu_relax();
 		} while (--count);
 		__ticket_lock_spinning(lock, inc.tail);
 	}
-out:	barrier();	/* make sure nothing creeps before the lock is taken */
+clear_slowpath:
+	__ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 	arch_spinlock_t old, new;
 
 	old.tickets = READ_ONCE(lock->tickets);
-	if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+	if (!__tickets_equal(old.tickets.head, old.tickets.tail))
 		return 0;
 
 	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+	new.head_tail &= ~TICKET_SLOWPATH_FLAG;
 
 	/* cmpxchg is a full barrier, so nothing can move before it */
 	return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
-					    arch_spinlock_t old)
-{
-	arch_spinlock_t new;
-
-	BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
-	/* Perform the unlock on the "before" copy */
-	old.tickets.head += TICKET_LOCK_INC;
-
-	/* Clear the slowpath flag */
-	new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
-	/*
-	 * If the lock is uncontended, clear the flag - use cmpxchg in
-	 * case it changes behind our back though.
-	 */
-	if (new.tickets.head != new.tickets.tail ||
-	    cmpxchg(&lock->head_tail, old.head_tail,
-					new.head_tail) != old.head_tail) {
-		/*
-		 * Lock still has someone queued for it, so wake up an
-		 * appropriate waiter.
-		 */
-		__ticket_unlock_kick(lock, old.tickets.head);
-	}
-}
-
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	if (TICKET_SLOWPATH_FLAG &&
-	    static_key_false(&paravirt_ticketlocks_enabled)) {
-		arch_spinlock_t prev;
+		static_key_false(&paravirt_ticketlocks_enabled)) {
+		__ticket_t head;
 
-		prev = *lock;
-		add_smp(&lock->tickets.head, TICKET_LOCK_INC);
+		BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
 
-		/* add_smp() is a full mb() */
+		head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
 
-		if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
-			__ticket_unlock_slowpath(lock, prev);
+		if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
+			head &= ~TICKET_SLOWPATH_FLAG;
+			__ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+		}
 	} else
 		__add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
@@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return tmp.tail != tmp.head;
+	return !__tickets_equal(tmp.tail, tmp.head);
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+	tmp.head &= ~TICKET_SLOWPATH_FLAG;
+	return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended	arch_spin_is_contended
 
@@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	__ticket_t head = ACCESS_ONCE(lock->tickets.head);
+	__ticket_t head = READ_ONCE(lock->tickets.head);
 
 	for (;;) {
-		struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+		struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 		/*
 		 * We need to check "unlocked" in a loop, tmp.head == head
 		 * can be false positive because of overflow.
 		 */
-		if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) ||
-		    tmp.head != head)
+		if (__tickets_equal(tmp.head, tmp.tail) ||
+				!__tickets_equal(tmp.head, head))
 			break;
 
 		cpu_relax();
-- 
cgit 


From b273c2c2f2d2d13dc0bfa8923d52fbaf8fa56ae8 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Date: Mon, 2 Feb 2015 20:27:11 +0100
Subject: x86/apic: Fix the devicetree build in certain configs

Without this patch:

  LD      init/built-in.o
  arch/x86/built-in.o: In function `dtb_lapic_setup': kernel/devicetree.c:155:
  undefined reference to `apic_force_enable'
  Makefile:923: recipe for target 'vmlinux' failed
  make: *** [vmlinux] Error 1

Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Reviewed-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Jan Beulich <JBeulich@suse.com>
Link: http://lkml.kernel.org/r/1422905231-16067-1-git-send-email-ricardo.ribalda@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 92003f3c8a42..efc3b22d896e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -213,7 +213,15 @@ void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
+
+#ifdef CONFIG_X86_64
+static inline int apic_force_enable(unsigned long addr)
+{
+	return -1;
+}
+#else
 extern int apic_force_enable(unsigned long addr);
+#endif
 
 extern int apic_bsp_setup(bool upmode);
 extern void apic_ap_setup(void);
-- 
cgit 


From 28a375df16c2b6d01227541f3956568995aa5fda Mon Sep 17 00:00:00 2001
From: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Date: Fri, 30 Jan 2015 16:29:38 +0000
Subject: x86/intel/quark: Add Isolated Memory Regions for Quark X1000

Intel's Quark X1000 SoC contains a set of registers called
Isolated Memory Regions. IMRs are accessed over the IOSF mailbox
interface. IMRs are areas carved out of memory that define
read/write access rights to the various system agents within the
Quark system. For a given agent in the system it is possible to
specify if that agent may read or write an area of memory
defined by an IMR with a granularity of 1 KiB.

Quark_SecureBootPRM_330234_001.pdf section 4.5 details the
concept of IMRs quark-x1000-datasheet.pdf section 12.7.4 details
the implementation of IMRs in silicon.

eSRAM flush, CPU Snoop write-only, CPU SMM Mode, CPU non-SMM
mode, RMU and PCIe Virtual Channels (VC0 and VC1) can have
individual read/write access masks applied to them for a given
memory region in Quark X1000. This enables IMRs to treat each
memory transaction type listed above on an individual basis and
to filter appropriately based on the IMR access mask for the
memory region. Quark supports eight IMRs.

Since all of the DMA capable SoC components in the X1000 are
mapped to VC0 it is possible to define sections of memory as
invalid for DMA write operations originating from Ethernet, USB,
SD and any other DMA capable south-cluster component on VC0.
Similarly it is possible to mark kernel memory as non-SMM mode
read/write only or to mark BIOS runtime memory as SMM mode
accessible only depending on the particular memory footprint on
a given system.

On an IMR violation Quark SoC X1000 systems are configured to
reset the system, so ensuring that the IMR memory map is
consistent with the EFI provided memory map is critical to
ensure no IMR violations reset the system.

The API for accessing IMRs is based on MTRR code but doesn't
provide a /proc or /sys interface to manipulate IMRs. Defining
the size and extent of IMRs is exclusively the domain of
in-kernel code.

Quark firmware sets up a series of locked IMRs around pieces of
memory that firmware owns such as ACPI runtime data. During boot
a series of unlocked IMRs are placed around items in memory to
guarantee no DMA modification of those items can take place.
Grub also places an unlocked IMR around the kernel boot params
data structure and compressed kernel image. It is necessary for
the kernel to tear down all unlocked IMRs in order to ensure
that the kernel's view of memory passed via the EFI memory map
is consistent with the IMR memory map. Without tearing down all
unlocked IMRs on boot transitory IMRs such as those used to
protect the compressed kernel image will cause IMR violations and system reboots.

The IMR init code tears down all unlocked IMRs and sets a
protective IMR around the kernel .text and .rodata as one
contiguous block. This sanitizes the IMR memory map with respect
to the EFI memory map and protects the read-only portions of the
kernel from unwarranted DMA access.

Tested-by: Ong, Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Reviewed-by: Andy Shevchenko <andy.schevchenko@gmail.com>
Reviewed-by: Darren Hart <dvhart@linux.intel.com>
Reviewed-by: Ong, Boon Leong <boon.leong.ong@intel.com>
Cc: andy.shevchenko@gmail.com
Cc: dvhart@infradead.org
Link: http://lkml.kernel.org/r/1422635379-12476-2-git-send-email-pure.logic@nexus-software.ie
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/imr.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 arch/x86/include/asm/imr.h

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
new file mode 100644
index 000000000000..cd2ce4068441
--- /dev/null
+++ b/arch/x86/include/asm/imr.h
@@ -0,0 +1,60 @@
+/*
+ * imr.h: Isolated Memory Region API
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _IMR_H
+#define _IMR_H
+
+#include <linux/types.h>
+
+/*
+ * IMR agent access mask bits
+ * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register
+ * definitions.
+ */
+#define IMR_ESRAM_FLUSH		BIT(31)
+#define IMR_CPU_SNOOP		BIT(30)		/* Applicable only to write */
+#define IMR_RMU			BIT(29)
+#define IMR_VC1_SAI_ID3		BIT(15)
+#define IMR_VC1_SAI_ID2		BIT(14)
+#define IMR_VC1_SAI_ID1		BIT(13)
+#define IMR_VC1_SAI_ID0		BIT(12)
+#define IMR_VC0_SAI_ID3		BIT(11)
+#define IMR_VC0_SAI_ID2		BIT(10)
+#define IMR_VC0_SAI_ID1		BIT(9)
+#define IMR_VC0_SAI_ID0		BIT(8)
+#define IMR_CPU_0		BIT(1)		/* SMM mode */
+#define IMR_CPU			BIT(0)		/* Non SMM mode */
+#define IMR_ACCESS_NONE		0
+
+/*
+ * Read/Write access-all bits here include some reserved bits
+ * These are the values firmware uses and are accepted by hardware.
+ * The kernel defines read/write access-all in the same way as firmware
+ * in order to have a consistent and crisp definition across firmware,
+ * bootloader and kernel.
+ */
+#define IMR_READ_ACCESS_ALL	0xBFFFFFFF
+#define IMR_WRITE_ACCESS_ALL	0xFFFFFFFF
+
+/* Number of IMRs provided by Quark X1000 SoC */
+#define QUARK_X1000_IMR_MAX	0x08
+#define QUARK_X1000_IMR_REGBASE 0x40
+
+/* IMR alignment bits - only bits 31:10 are checked for IMR validity */
+#define IMR_ALIGN		0x400
+#define IMR_MASK		(IMR_ALIGN - 1)
+
+int imr_add_range(phys_addr_t base, size_t size,
+		  unsigned int rmask, unsigned int wmask, bool lock);
+
+int imr_remove_range(phys_addr_t base, size_t size);
+
+#endif /* _IMR_H */
-- 
cgit 


From f47233c2d34f243ecdaac179c3408a39ff9216a7 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 13 Feb 2015 16:04:55 +0100
Subject: x86/mm/ASLR: Propagate base load address calculation

Commit:

  e2b32e678513 ("x86, kaslr: randomize module base load address")

makes the base address for module to be unconditionally randomized in
case when CONFIG_RANDOMIZE_BASE is defined and "nokaslr" option isn't
present on the commandline.

This is not consistent with how choose_kernel_location() decides whether
it will randomize kernel load base.

Namely, CONFIG_HIBERNATION disables kASLR (unless "kaslr" option is
explicitly specified on kernel commandline), which makes the state space
larger than what module loader is looking at. IOW CONFIG_HIBERNATION &&
CONFIG_RANDOMIZE_BASE is a valid config option, kASLR wouldn't be applied
by default in that case, but module loader is not aware of that.

Instead of fixing the logic in module.c, this patch takes more generic
aproach. It introduces a new bootparam setup data_type SETUP_KASLR and
uses that to pass the information whether kaslr has been applied during
kernel decompression, and sets a global 'kaslr_enabled' variable
accordingly, so that any kernel code (module loading, livepatching, ...)
can make decisions based on its value.

x86 module loader is converted to make use of this flag.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Link: https://lkml.kernel.org/r/alpine.LNX.2.00.1502101411280.10719@pobox.suse.cz
[ Always dump correct kaslr status when panicking ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/page_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..3d43ce36eaba 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,6 +3,7 @@
 
 #include <linux/const.h>
 #include <linux/types.h>
+#include <asm/bootparam.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
@@ -51,6 +52,8 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+extern bool kaslr_enabled;
+
 static inline phys_addr_t get_max_mapped(void)
 {
 	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
-- 
cgit 


From e3a1f6cac1fe20e7ac01d96c914c25726723a64e Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Thu, 19 Feb 2015 13:06:53 +0000
Subject: x86: pte_protnone() and pmd_protnone() must check entry is not
 present

Since _PAGE_PROTNONE aliases _PAGE_GLOBAL it is only valid if
_PAGE_PRESENT is clear.  Make pte_protnone() and pmd_protnone() check
for this.

This fixes a 64-bit Xen PV guest regression introduced by 8a0516ed8b90
("mm: convert p[te|md]_numa users to p[te|md]_protnone_numa").  Any
userspace process would endlessly fault.

In a 64-bit PV guest, userspace page table entries have _PAGE_GLOBAL set
by the hypervisor.  This meant that any fault on a present userspace
entry (e.g., a write to a read-only mapping) would be misinterpreted as
a NUMA hinting fault and the fault would not be correctly handled,
resulting in the access endlessly faulting.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 67fc3d2b0aab..a0c35bf6cb92 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -476,12 +476,14 @@ static inline int pmd_present(pmd_t pmd)
  */
 static inline int pte_protnone(pte_t pte)
 {
-	return pte_flags(pte) & _PAGE_PROTNONE;
+	return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+		== _PAGE_PROTNONE;
 }
 
 static inline int pmd_protnone(pmd_t pmd)
 {
-	return pmd_flags(pmd) & _PAGE_PROTNONE;
+	return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+		== _PAGE_PROTNONE;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-- 
cgit 


From 570e1aa84c376ff39809442f09c7606ddf62cfd1 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 20 Feb 2015 10:18:59 +0100
Subject: x86/mm/ASLR: Avoid PAGE_SIZE redefinition for UML subarch

Commit f47233c2d34 ("x86/mm/ASLR: Propagate base load address
calculation") causes PAGE_SIZE redefinition warnings for UML
subarch  builds. This is caused by added includes that were
leftovers from previous  patch versions are are not actually
needed (especially page_types.h  inlcude in module.c). Drop
those stray includes.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Cc: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1502201017240.28769@pobox.suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/page_types.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 3d43ce36eaba..95e11f79f123 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,7 +3,6 @@
 
 #include <linux/const.h>
 #include <linux/types.h>
-#include <asm/bootparam.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
-- 
cgit 


From 06c8173eb92bbfc03a0fe8bb64315857d0badd06 Mon Sep 17 00:00:00 2001
From: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Date: Thu, 5 Mar 2015 13:19:22 +0100
Subject: x86/fpu/xsaves: Fix improper uses of __ex_table

Commit:

  f31a9f7c7169 ("x86/xsaves: Use xsaves/xrstors to save and restore xsave area")

introduced alternative instructions for XSAVES/XRSTORS and commit:

  adb9d526e982 ("x86/xsaves: Add xsaves and xrstors support for booting time")

added support for the XSAVES/XRSTORS instructions at boot time.

Unfortunately both failed to properly protect them against faulting:

The 'xstate_fault' macro will use the closest label named '1'
backward and that ends up in the .altinstr_replacement section
rather than in .text. This means that the kernel will never find
in the __ex_table the .text address where this instruction might
fault, leading to serious problems if userspace manages to
trigger the fault.

Signed-off-by: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Signed-off-by: Jamie Iles <jamie.iles@oracle.com>
[ Improved the changelog, fixed some whitespace noise. ]
Acked-by: Borislav Petkov <bp@alien8.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@vger.kernel.org>
Cc: Allan Xavier <mr.a.xavier@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: adb9d526e982 ("x86/xsaves: Add xsaves and xrstors support for booting time")
Fixes: f31a9f7c7169 ("x86/xsaves: Use xsaves/xrstors to save and restore xsave area")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/xsave.h | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 5fa9770035dc..c9a6d68b8d62 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -82,18 +82,15 @@ static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XSAVES"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XSAVE"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -112,18 +109,15 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XRSTORS"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XRSTOR"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -149,9 +143,9 @@ static inline int xsave_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input_2(
 		"1:"XSAVE,
-		"1:"XSAVEOPT,
+		XSAVEOPT,
 		X86_FEATURE_XSAVEOPT,
-		"1:"XSAVES,
+		XSAVES,
 		X86_FEATURE_XSAVES,
 		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
 		"memory");
@@ -178,7 +172,7 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input(
 		"1: " XRSTOR,
-		"1: " XRSTORS,
+		XRSTORS,
 		X86_FEATURE_XSAVES,
 		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 		: "memory");
-- 
cgit 


From f4c3686386393c120710dd34df2a74183ab805fd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 13 Mar 2015 09:53:10 +0100
Subject: x86/fpu: Drop_fpu() should not assume that tsk equals current

drop_fpu() does clear_used_math() and usually this is correct
because tsk == current.

However switch_fpu_finish()->restore_fpu_checking() is called before
__switch_to() updates the "current_task" variable. If it fails,
we will wrongly clear the PF_USED_MATH flag of the previous task.

So use clear_stopped_child_used_math() instead.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150309171041.GB11388@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 0dbc08282291..72ba21a8b5fc 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -370,7 +370,7 @@ static inline void drop_fpu(struct task_struct *tsk)
 	preempt_disable();
 	tsk->thread.fpu_counter = 0;
 	__drop_fpu(tsk);
-	clear_used_math();
+	clear_stopped_child_used_math(tsk);
 	preempt_enable();
 }
 
-- 
cgit 


From 69797dafe35541bfff1989c0b37c66ed785faf0e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 16 Mar 2015 11:06:28 +0100
Subject: Revert "x86/mm/ASLR: Propagate base load address calculation"

This reverts commit:

  f47233c2d34f ("x86/mm/ASLR: Propagate base load address calculation")

The main reason for the revert is that the new boot flag does not work
at all currently, and in order to make this work, we need non-trivial
changes to the x86 boot code which we didn't manage to get done in
time for merging.

And even if we did, they would've been too risky so instead of
rushing things and break booting 4.1 on boxes left and right, we
will be very strict and conservative and will take our time with
this to fix and test it properly.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: H. Peter Anvin <hpa@linux.intel.com
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Junjie Mao <eternal.n08@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/20150316100628.GD22995@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/page_types.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 95e11f79f123..f97fbe3abb67 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,8 +51,6 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
-extern bool kaslr_enabled;
-
 static inline phys_addr_t get_max_mapped(void)
 {
 	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
-- 
cgit 


From 9e8ce4b96b781b003e3174fbbc62e1d4388c8b8f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 20 Mar 2015 14:56:19 +0100
Subject: Revert "x86/PCI: Refine the way to release PCI IRQ resources"

Commit b4b55cda5874 (Refine the way to release PCI IRQ resources)
introduced a regression in the PCI IRQ resource management by causing
the IRQ resource of a device, established when pci_enabled_device()
is called on a fully disabled device, to be released when the driver
is unbound from the device, regardless of the enable_cnt.

This leads to the situation that an ill-behaved driver can now make a
device unusable to subsequent drivers by an imbalance in their use of
pci_enable/disable_device().  That is a serious problem for secondary
drivers like vfio-pci, which are innocent of the transgressions of
the previous driver.

Since the solution of this problem is not immediate and requires
further discussion, revert commit b4b55cda5874 and the issue it was
supposed to address (a bug related to xen-pciback) will be taken
care of in a different way going forward.

Reported-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/pci_x86.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index fa1195dae425..164e3f8d3c3d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock;
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
 
+extern bool mp_should_keep_irq(struct device *dev);
+
 struct pci_raw_ops {
 	int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
 						int reg, int len, u32 *val);
-- 
cgit 


From cae2a173fe94ab3a437416af6f092fae2e65837e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 6 Apr 2015 10:26:17 -0700
Subject: x86: clean up/fix 'copy_in_user()' tail zeroing

The rule for 'copy_from_user()' is that it zeroes the remaining kernel
buffer even when the copy fails halfway, just to make sure that we don't
leave uninitialized kernel memory around.  Because even if we check for
errors, some kernel buffers stay around after thge copy (think page
cache).

However, the x86-64 logic for user copies uses a copy_user_generic()
function for all the cases, that set the "zerorest" flag for any fault
on the source buffer.  Which meant that it didn't just try to clear the
kernel buffer after a failure in copy_from_user(), it also tried to
clear the destination user buffer for the "copy_in_user()" case.

Not only is that pointless, it also means that the clearing code has to
worry about the tail clearing taking page faults for the user buffer
case.  Which is just stupid, since that case shouldn't happen in the
first place.

Get rid of the whole "zerorest" thing entirely, and instead just check
if the destination is in kernel space or not.  And then just use
memset() to clear the tail of the kernel buffer if necessary.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess_64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/include/asm')

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 12a26b979bf1..f2f9b39b274a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 }
 
 unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+copy_user_handle_tail(char *to, char *from, unsigned len);
 
 #endif /* _ASM_X86_UACCESS_64_H */
-- 
cgit