From d6d55f0b9d900673548515614b56ab55aa2c51f8 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Thu, 29 May 2014 17:26:50 +0200 Subject: perf/x86/amd: AMD support for bp_len > HW_BREAKPOINT_LEN_8 Implement hardware breakpoint address mask for AMD Family 16h and above processors. CPUID feature bit indicates hardware support for DRn_ADDR_MASK MSRs. These masks further qualify DRn/DR7 hardware breakpoint addresses to allow matching of larger address ranges. Valuable advice and pseudo code from Oleg Nesterov Signed-off-by: Jacob Shin Signed-off-by: Suravee Suthikulpanit Acked-by: Jiri Olsa Reviewed-by: Oleg Nesterov Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Cc: xiakaixu Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/debugreg.h | 5 +++++ arch/x86/include/asm/hw_breakpoint.h | 1 + 3 files changed, 8 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0bb1335313b2..53966d65591e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -174,6 +174,7 @@ #define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ /* @@ -383,6 +384,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) #define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) +#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT) #if __GNUC__ >= 4 extern void warn_pre_alternatives(void); diff --git a/arch/x86/include/asm/debugreg.h 
b/arch/x86/include/asm/debugreg.h index 61fd18b83b6c..12cb66f6d3a5 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -114,5 +114,10 @@ static inline void debug_stack_usage_inc(void) { } static inline void debug_stack_usage_dec(void) { } #endif /* X86_64 */ +#ifdef CONFIG_CPU_SUP_AMD +extern void set_dr_addr_mask(unsigned long mask, int dr); +#else +static inline void set_dr_addr_mask(unsigned long mask, int dr) { } +#endif #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index ef1c4d2d41ec..6c98be864a75 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -12,6 +12,7 @@ */ struct arch_hw_breakpoint { unsigned long address; + unsigned long mask; u8 len; u8 type; }; -- cgit From b700e7f03df5d92f85fa5247fe1f557528d3363d Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:19 -0600 Subject: livepatch: kernel: add support for live patching This commit introduces code for the live patching core. It implements an ftrace-based mechanism and kernel interface for doing live patching of kernel and kernel module functions. It represents the greatest common functionality set between kpatch and kgraft and can accept patches built using either method. This first version does not implement any consistency mechanism that ensures that old and new code do not run together. In practice, ~90% of CVEs are safe to apply in this way, since they simply add a conditional check. However, any function change that can not execute safely with the old version of the function can _not_ be safely applied in this version. 
[ jkosina@suse.cz: due to the number of contributions that got folded into this original patch from Seth Jennings, add SUSE's copyright as well, as discussed via e-mail ] Signed-off-by: Seth Jennings Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Miroslav Benes Signed-off-by: Petr Mladek Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 arch/x86/include/asm/livepatch.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 000000000000..d529db1b1edf --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,37 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include + +#ifdef CONFIG_LIVE_PATCHING +#ifndef CC_USING_FENTRY +#error Your compiler must support -mfentry for live patching to work +#endif +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +#else +#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ -- cgit From b5bfc51707f1b56b0b733980bb4fcc0562bf02d8 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 19 Dec 2014 14:11:17 +0800 Subject: livepatch: move x86 specific ftrace handler code to arch/x86 The execution flow redirection related implementation in the livepatch ftrace handler depends on the specific architecture. This patch introduces klp_arch_set_pc(like kgdb_arch_set_pc) interface to change the pt_regs. Signed-off-by: Li Bin Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index d529db1b1edf..b5608d7757fd 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -22,6 +22,7 @@ #define _ASM_X86_LIVEPATCH_H #include +#include #ifdef CONFIG_LIVE_PATCHING #ifndef CC_USING_FENTRY @@ -30,6 +31,10 @@ extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} #else #error Live patching support is disabled; check CONFIG_LIVE_PATCHING #endif -- cgit From 6ca7a8a15035add0a4f9b2fd658118d41dbeb20c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 21 Dec 2014 15:02:23 +0100 Subject: x86/fpu: Use a symbolic name for asm operand Fix up the else-case in fpu_fxsave() which seems like it has been overlooked. 
Correct comment style in restore_fpu_checking() while at it. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1419170543-11393-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e97622f57722..0dbc08282291 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -207,7 +207,7 @@ static inline void fpu_fxsave(struct fpu *fpu) if (config_enabled(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave)); + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory * operand uses any extended registers for addressing, a second @@ -290,9 +290,11 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. "m" is a random variable that should be in L1 */ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. + * "m" is a random variable that should be in L1. 
+ */ if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" -- cgit From 959274753857efe9c5f1ba35fe727f51e9aa128d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 19 Nov 2014 17:41:09 -0800 Subject: x86, traps: Track entry into and exit from IST context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently pretend that IST context is like standard exception context, but this is incorrect. IST entries from userspace are like standard exceptions except that they use per-cpu stacks, so they are atomic. IST entries from kernel space are like NMIs from RCU's perspective -- they are not quiescent states even if they interrupted the kernel during a quiescent state. Add and use ist_enter and ist_exit to track IST context. Even though x86_32 has no IST stacks, we track these interrupts the same way. This fixes two issues: - Scheduling from an IST interrupt handler will now warn. It would previously appear to work as long as we got lucky and nothing overwrote the stack frame. (I don't know of any bugs in this that would trigger the warning, but it's good to be on the safe side.) - RCU handling in IST context was dangerous. As far as I know, only machine checks were likely to trigger this, but it's good to be on the safe side. Note that the machine check handlers appear to have been missing any context tracking at all before this patch. Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Frédéric Weisbecker Signed-off-by: Andy Lutomirski --- arch/x86/include/asm/traps.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 707adc6549d8..3cf525ec762d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H +#include #include #include @@ -110,6 +111,9 @@ asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif +extern enum ctx_state ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); + /* Interrupts/Exceptions */ enum { X86_TRAP_DE = 0, /* 0, Divide-by-zero */ -- cgit From 83653c16da91112236292871b820cb8b367220e3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 13 Nov 2014 15:57:07 -0800 Subject: x86: Clean up current_stack_pointer There's no good reason for it to be a macro, and x86_64 will want to use it, so it should be in a header. 
Acked-by: Borislav Petkov Signed-off-by: Andy Lutomirski --- arch/x86/include/asm/thread_info.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 547e344a6dc6..8b13b0fbda8e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -170,6 +170,17 @@ static inline struct thread_info *current_thread_info(void) return ti; } +static inline unsigned long current_stack_pointer(void) +{ + unsigned long sp; +#ifdef CONFIG_X86_64 + asm("mov %%rsp,%0" : "=g" (sp)); +#else + asm("mov %%esp,%0" : "=g" (sp)); +#endif + return sp; +} + #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ -- cgit From bced35b65aefe53a6f77a9ed0ce1aea86e9d65a2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 19 Nov 2014 17:59:41 -0800 Subject: x86, traps: Add ist_begin_non_atomic and ist_end_non_atomic In some IST handlers, if the interrupt came from user mode, we can safely enable preemption. Add helpers to do it safely. This is intended to be used by the memory failure code in do_machine_check. 
Acked-by: Borislav Petkov Signed-off-by: Andy Lutomirski --- arch/x86/include/asm/traps.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 3cf525ec762d..4e49d7dff78e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -113,6 +113,8 @@ asmlinkage void mce_threshold_interrupt(void); extern enum ctx_state ist_enter(struct pt_regs *regs); extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); +extern void ist_begin_non_atomic(struct pt_regs *regs); +extern void ist_end_non_atomic(void); /* Interrupts/Exceptions */ enum { -- cgit From d4812e169de44f4ab53ff671c6193c67de24da62 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Mon, 5 Jan 2015 16:44:42 -0800 Subject: x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks We now switch to the kernel stack when a machine check interrupts during user mode. This means that we can perform recovery actions in the tail of do_machine_check() Acked-by: Borislav Petkov Signed-off-by: Tony Luck Signed-off-by: Andy Lutomirski --- arch/x86/include/asm/mce.h | 1 - arch/x86/include/asm/thread_info.h | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 51b26e895933..9b3de99dc004 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -190,7 +190,6 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); -void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8b13b0fbda8e..e82e95abc92b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -75,7 +75,6 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* 
syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -100,7 +99,6 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -140,7 +138,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ -- cgit From 7c6a98dfa1ba9dc64a62e73624ecea9995736bbd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 16 Dec 2014 09:08:14 -0500 Subject: KVM: x86: add method to test PIR bitmap vector kvm_x86_ops->test_posted_interrupt() returns true/false depending whether 'vector' is set. Next patch makes use of this interface. 
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d89c6b828c96..cb19d05af3cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -753,6 +753,7 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); -- cgit From c205fb7d7d4f81e46fc577b707ceb9e356af1456 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Dec 2014 02:52:16 +0200 Subject: KVM: x86: #PF error-code on R/W operations is wrong When emulating an instruction that reads the destination memory operand (i.e., instructions without the Mov flag in the emulator), the operand is first read. If a page-fault is detected in this phase, the error-code which would be delivered to the VM does not indicate that the access that caused the exception is a write one. This does not conform with real hardware, and may cause the VM to enter the page-fault handler twice for no reason (once for read, once for write). 
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cb19d05af3cd..97a5dd0222c8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -160,6 +160,18 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* -- cgit From b9dfe0bed999d23ee8838d389637dd8aef83fafa Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 9 Jan 2015 10:53:21 +0100 Subject: livepatch: handle ancient compilers with more grace We are aborting a build in case when gcc doesn't support fentry on x86_64 (regs->ip modification can't really reliably work with mcount). This however breaks allmodconfig for people with older gccs that don't support -mfentry. Turn the build-time failure into runtime failure, resulting in the whole infrastructure not being initialized if CC_USING_FENTRY is unset. 
Reported-by: Andrew Morton Signed-off-by: Jiri Kosina Signed-off-by: Andrew Morton Acked-by: Josh Poimboeuf --- arch/x86/include/asm/livepatch.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index b5608d7757fd..26e58134c8cb 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -25,9 +25,13 @@ #include #ifdef CONFIG_LIVE_PATCHING +static inline int klp_check_compiler_support(void) +{ #ifndef CC_USING_FENTRY -#error Your compiler must support -mfentry for live patching to work + return 1; #endif + return 0; +} extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); -- cgit From e182c570e9953859aee5cb016583217d9e68ea18 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 12 Dec 2014 01:56:04 +0200 Subject: x86/uaccess: fix sparse errors virtio wants to read bitwise types from userspace using get_user. At the moment this triggers sparse errors, since the value is passed through an integer. Fix that up using __force. Signed-off-by: Michael S. 
Tsirkin Acked-by: Thomas Gleixner --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0d592e0a5b84..ace9dec050b1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) asm volatile("call __get_user_%P3" \ : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ - (x) = (__typeof__(*(ptr))) __val_gu; \ + (x) = (__force __typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) -- cgit From af9cfe270dd133f2f40c287e8d41919bb8d063e3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 8 Jan 2015 17:25:12 +0100 Subject: x86: entry_64.S: delete unused code A define, two macros and an unreferenced bit of assembly are gone. Acked-by: Borislav Petkov CC: Linus Torvalds CC: Oleg Nesterov CC: "H. Peter Anvin" CC: Andy Lutomirski CC: Frederic Weisbecker CC: X86 ML CC: Alexei Starovoitov CC: Will Drewry CC: Kees Cook CC: linux-kernel@vger.kernel.org Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski --- arch/x86/include/asm/calling.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 76659b67fd11..1f1297b46f83 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with #define SS 160 #define ARGOFFSET R11 -#define SWFRAME ORIG_RAX .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 subq $9*8+\addskip, %rsp -- cgit From a1dafe857db56c35878c71560089a4694ac841fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Jan 2015 15:31:28 +0800 Subject: iommu, x86: Restructure setup of the irq remapping feature enable_IR_x2apic() calls setup_irq_remapping_ops() which by 
default installs the intel dmar remapping ops and then calls the amd iommu irq remapping prepare callback to figure out whether we are running on an AMD machine with irq remapping hardware. Right after that it calls irq_remapping_prepare() which pointlessly checks: if (!remap_ops || !remap_ops->prepare) return -ENODEV; and then calls remap_ops->prepare() which is silly in the AMD case as it got called from setup_irq_remapping_ops() already a few microseconds ago. Simplify this and just collapse everything into irq_remapping_prepare(). The irq_remapping_prepare() remains still silly as it assigns blindly the intel ops, but that's not scope of this patch. The scope here is to move the preparatory work, i.e. memory allocations out of the atomic section which is required to enable irq remapping. Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Acked-and-tested-by: Joerg Roedel Cc: Tony Luck Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Yinghai Lu Cc: David Rientjes Cc: HATAYAMA Daisuke Cc: Jan Beulich Cc: Richard Weinberger Cc: Oren Twaig Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20141205084147.232633738@linutronix.de Link: http://lkml.kernel.org/r/1420615903-28253-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index b7747c4c2cf2..f1b619e5a50d 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,7 +33,6 @@ struct irq_cfg; #ifdef CONFIG_IRQ_REMAP -extern void setup_irq_remapping_ops(void); extern int irq_remapping_supported(void); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); @@ -60,7 +59,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip); #else /* 
CONFIG_IRQ_REMAP */ -static inline void setup_irq_remapping_ops(void) { } static inline int irq_remapping_supported(void) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } -- cgit From c392f56c946033bd136043079a62b9188888828d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 7 Jan 2015 15:31:40 +0800 Subject: iommu/irq_remapping: Kill function irq_remapping_supported() and related code Simplify irq_remapping code by killing irq_remapping_supported() and related interfaces. Joerg posted a similar patch at https://lkml.org/lkml/2014/12/15/490, so assume a Signed-off-by from Joerg. Signed-off-by: Jiang Liu Signed-off-by: Joerg Roedel Tested-by: Joerg Roedel Cc: Tony Luck Cc: iommu@lists.linux-foundation.org Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Yinghai Lu Cc: Borislav Petkov Cc: David Rientjes Cc: HATAYAMA Daisuke Cc: Jan Beulich Cc: Richard Weinberger Cc: Oren Twaig Link: http://lkml.kernel.org/r/1420615903-28253-14-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index f1b619e5a50d..6224d316c405 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,7 +33,6 @@ struct irq_cfg; #ifdef CONFIG_IRQ_REMAP -extern int irq_remapping_supported(void); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); @@ -59,7 +58,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip); #else /* CONFIG_IRQ_REMAP */ -static inline int irq_remapping_supported(void) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return 
-ENODEV; } -- cgit From e108ff2f8033a417ee3e517d9f8730f665646076 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 15 Jan 2015 15:58:54 -0800 Subject: KVM: x86: switch to kvm_get_dirty_log_protect We now have a generic function that does most of the work of kvm_vm_ioctl_get_dirty_log, now use it. Acked-by: Christoffer Dall Signed-off-by: Mario Smarduch --- arch/x86/include/asm/kvm_host.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cb19d05af3cd..3ceddf41ca74 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -821,9 +821,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -- cgit From bccec2a0a25206cb837e939adab94768a990ffa9 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 6 Jan 2015 22:49:54 +0100 Subject: x86/spinlock: Leftover conversion ACCESS_ONCE->READ_ONCE commit 78bff1c8684f ("x86/ticketlock: Fix spin_unlock_wait() livelock") introduced two additional ACCESS_ONCE cases in x86 spinlock.h. Lets change those as well. 
Signed-off-by: Christian Borntraeger Cc: Oleg Nesterov --- arch/x86/include/asm/spinlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f8a2fc..7050d864f520 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -183,10 +183,10 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - __ticket_t head = ACCESS_ONCE(lock->tickets.head); + __ticket_t head = READ_ONCE(lock->tickets.head); for (;;) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); /* * We need to check "unlocked" in a loop, tmp.head == head * can be false positive because of overflow. -- cgit From 0e1540208ef34a2246822fa56f751efe23748e7a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Jan 2015 18:39:35 +0200 Subject: x86: pmc_atom: Expose contents of PSS The PSS register reflects the power state of each island on SoC. It would be useful to know which of the islands is on or off at the moment. Signed-off-by: Andy Shevchenko Acked-by: Aubrey Li Cc: Rafael J. Wysocki Cc: Kumar P. 
Mahesh Link: http://lkml.kernel.org/r/1421253575-22509-6-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pmc_atom.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h index fc7a17c05d35..bc0fc0866553 100644 --- a/arch/x86/include/asm/pmc_atom.h +++ b/arch/x86/include/asm/pmc_atom.h @@ -53,6 +53,28 @@ /* Sleep state counter is in units of of 32us */ #define PMC_TMR_SHIFT 5 +/* Power status of power islands */ +#define PMC_PSS 0x98 + +#define PMC_PSS_BIT_GBE BIT(0) +#define PMC_PSS_BIT_SATA BIT(1) +#define PMC_PSS_BIT_HDA BIT(2) +#define PMC_PSS_BIT_SEC BIT(3) +#define PMC_PSS_BIT_PCIE BIT(4) +#define PMC_PSS_BIT_LPSS BIT(5) +#define PMC_PSS_BIT_LPE BIT(6) +#define PMC_PSS_BIT_DFX BIT(7) +#define PMC_PSS_BIT_USH_CTRL BIT(8) +#define PMC_PSS_BIT_USH_SUS BIT(9) +#define PMC_PSS_BIT_USH_VCCS BIT(10) +#define PMC_PSS_BIT_USH_VCCA BIT(11) +#define PMC_PSS_BIT_OTG_CTRL BIT(12) +#define PMC_PSS_BIT_OTG_VCCS BIT(13) +#define PMC_PSS_BIT_OTG_VCCA_CLK BIT(14) +#define PMC_PSS_BIT_OTG_VCCA BIT(15) +#define PMC_PSS_BIT_USB BIT(16) +#define PMC_PSS_BIT_USB_SUS BIT(17) + /* These registers reflect D3 status of functions */ #define PMC_D3_STS_0 0xA0 -- cgit From 14e153ef75eecae8fd0738ffb42120f4962a00cd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Jan 2015 20:19:43 +0100 Subject: x86, fpu: Introduce per-cpu in_kernel_fpu state interrupted_kernel_fpu_idle() tries to detect if kernel_fpu_begin() is safe or not. In particular it should obviously deny the nested kernel_fpu_begin() and this logic looks very confusing. If use_eager_fpu() == T we rely on a) __thread_has_fpu() check in interrupted_kernel_fpu_idle(), and b) on the fact that _begin() does __thread_clear_has_fpu(). 
Otherwise we demand that the interrupted task has no FPU if it is in kernel mode, this works because __kernel_fpu_begin() does clts() and interrupted_kernel_fpu_idle() checks X86_CR0_TS. Add the per-cpu "bool in_kernel_fpu" variable, and change this code to check/set/clear it. This allows to do more cleanups and fixes, see the next changes. The patch also moves WARN_ON_ONCE() under preempt_disable() just to make this_cpu_read() look better, this is not really needed. And in fact I think we should move it into __kernel_fpu_begin(). Signed-off-by: Oleg Nesterov Reviewed-by: Rik van Riel Cc: matt.fleming@intel.com Cc: bp@suse.de Cc: pbonzini@redhat.com Cc: luto@amacapital.net Cc: Linus Torvalds Cc: Suresh Siddha Link: http://lkml.kernel.org/r/20150115191943.GB27332@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i387.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ed8089d69094..5e275d31802e 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -40,8 +40,8 @@ extern void __kernel_fpu_end(void); static inline void kernel_fpu_begin(void) { - WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); + WARN_ON_ONCE(!irq_fpu_usable()); __kernel_fpu_begin(); } -- cgit From 7575637ab293861a799f3bbafe0d8c597389f4e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Jan 2015 20:20:28 +0100 Subject: x86, fpu: Fix math_state_restore() race with kernel_fpu_begin() math_state_restore() can race with kernel_fpu_begin() if irq comes right after __thread_fpu_begin(), __save_init_fpu() will overwrite fpu->state we are going to restore. Add 2 simple helpers, kernel_fpu_disable() and kernel_fpu_enable() which simply set/clear in_kernel_fpu, and change math_state_restore() to exclude kernel_fpu_begin() in between. Alternatively we could use local_irq_save/restore, but probably these new helpers can have more users. 
Perhaps they should disable/enable preemption themselves, in this case we can remove preempt_disable() in __restore_xstate_sig(). Signed-off-by: Oleg Nesterov Reviewed-by: Rik van Riel Cc: matt.fleming@intel.com Cc: bp@suse.de Cc: pbonzini@redhat.com Cc: luto@amacapital.net Cc: Linus Torvalds Cc: Suresh Siddha Link: http://lkml.kernel.org/r/20150115192028.GD27332@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i387.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 5e275d31802e..6eb6fcb83f63 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -51,6 +51,10 @@ static inline void kernel_fpu_end(void) preempt_enable(); } +/* Must be called with preempt disabled */ +extern void kernel_fpu_disable(void); +extern void kernel_fpu_enable(void); + /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions -- cgit From 54750f2cf042c42b4223d67b1bd20138464bde0e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 20 Jan 2015 15:54:52 -0200 Subject: KVM: x86: workaround SuSE's 2.6.16 pvclock vs masterclock issue SuSE's 2.6.16 kernel fails to boot if the delta between tsc_timestamp and rdtsc is larger than a given threshold: * If we get more than the below threshold into the future, we rerequest * the real time from the host again which has only little offset then * that we need to adjust using the TSC. * * For now that threshold is 1/5th of a jiffie. That should be good * enough accuracy for completely broken systems, but also give us swing * to not call out to the host all the time. */ #define PVCLOCK_DELTA_MAX ((1000000000ULL / HZ) / 5) Disable masterclock support (which increases said delta) in case the boot vcpu does not use MSR_KVM_SYSTEM_TIME_NEW. 
Upstream kernels which support pvclock vsyscalls (and therefore make use of PVCLOCK_STABLE_BIT) use MSR_KVM_SYSTEM_TIME_NEW. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97a5dd0222c8..177b2f2ff9fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -627,6 +627,8 @@ struct kvm_arch { #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif + + bool boot_vcpu_runs_old_kvmclock; }; struct kvm_vm_stat { -- cgit From cfaa790a3fb8a7efa98f4a6457e19dc3a0db35d3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 15 Jan 2015 09:44:56 +0100 Subject: kvm: Fix CR3_PCID_INVD type on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kvm/emulate.c: In function ‘check_cr_write’: arch/x86/kvm/emulate.c:3552:4: warning: left shift count >= width of type rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; happens because sizeof(UL) on 32-bit is 4 bytes but we shift it 63 bits to the left.
Signed-off-by: Borislav Petkov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 177b2f2ff9fb..4327af53e544 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -51,7 +51,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL -#define CR3_PCID_INVD (1UL << 63) +#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ -- cgit From 8d80696060eedf49c080c0f2cf39a20ae7e787f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:09 +0000 Subject: x86/apic: Avoid open coded x2apic detection enable_IR_x2apic() grew an open coded x2apic detection. Implement a proper helper function which shares the code with the already existing x2apic_enabled(). Made it use rdmsrl_safe as suggested by Boris.
Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Link: http://lkml.kernel.org/r/20150115211702.285038186@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 465b309af254..1a8ba26c2fbd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -108,6 +108,15 @@ extern u64 native_apic_icr_read(void); extern int x2apic_mode; +static inline bool apic_is_x2apic_enabled(void) +{ + u64 msr; + + if (rdmsrl_safe(MSR_IA32_APICBASE, &msr)) + return false; + return msr & X2APIC_ENABLE; +} + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -175,15 +184,7 @@ extern void check_x2apic(void); extern void enable_x2apic(void); static inline int x2apic_enabled(void) { - u64 msr; - - if (!cpu_has_x2apic) - return 0; - - rdmsrl(MSR_IA32_APICBASE, msr); - if (msr & X2APIC_ENABLE) - return 1; - return 0; + return cpu_has_x2apic && apic_is_x2apic_enabled(); } #define x2apic_supported() (cpu_has_x2apic) -- cgit From 81a46dd8249d7fa72a8557e58a38aa984e6b5e16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:11 +0000 Subject: x86/apic: Make x2apic_mode depend on CONFIG_X86_X2APIC No point in having a static variable around which is always 0. Let the compiler optimize code out if disabled. 
Signed-off-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Link: http://lkml.kernel.org/r/20150115211702.363274310@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1a8ba26c2fbd..d2225fdc953e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -106,8 +106,6 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -extern int x2apic_mode; - static inline bool apic_is_x2apic_enabled(void) { u64 msr; @@ -178,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void) return val; } +extern int x2apic_mode; extern int x2apic_phys; extern int x2apic_preenabled; extern void check_x2apic(void); @@ -210,8 +209,9 @@ static inline void x2apic_force_phys(void) { } -#define x2apic_preenabled 0 -#define x2apic_supported() 0 +#define x2apic_mode (0) +#define x2apic_preenabled (0) +#define x2apic_supported() (0) #endif extern void enable_IR_x2apic(void); -- cgit From 2ca5b40479246087695d9e6343075b47ee6887ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:14 +0000 Subject: x86/ioapic: Check x2apic really The x2apic_preenabled flag is just a horrible hack and if X2APIC support is disabled it does not reflect the actual hardware state. Check the hardware instead. 
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211702.541280622@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d2225fdc953e..392bbcf35471 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -178,7 +178,6 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; -extern int x2apic_preenabled; extern void check_x2apic(void); extern void enable_x2apic(void); static inline int x2apic_enabled(void) @@ -210,7 +209,6 @@ static inline void x2apic_force_phys(void) } #define x2apic_mode (0) -#define x2apic_preenabled (0) #define x2apic_supported() (0) #endif -- cgit From d524165cb8dbb2ce5916cd7682236b9324ae2644 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:17 +0000 Subject: x86/apic: Check x2apic early No point in delaying the x2apic detection for the CONFIG_X86_X2APIC=n case to enable_IR_x2apic(). We rather detect that before we try to setup anything there. 
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211702.702479404@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 392bbcf35471..ca8deb484d03 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -178,7 +178,7 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; -extern void check_x2apic(void); +extern void __init check_x2apic(void); extern void enable_x2apic(void); static inline int x2apic_enabled(void) { -- cgit From 55eae7de727e9ecc814853ec364fbbb352c337df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:19 +0000 Subject: x86/x2apic: Move code in conditional region No point in having try_to_enable_x2apic() outside of the CONFIG_X86_X2APIC section and having inline functions and more ifdefs to deal with it. Move the code into the existing ifdef section and remove the inline cruft. Fixup the printk about not enabling interrupt remapping as suggested by Boris. 
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211702.795388613@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ca8deb484d03..951caa17d8ba 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -186,27 +186,11 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) -static inline void x2apic_force_phys(void) -{ - x2apic_phys = 1; -} #else -static inline void disable_x2apic(void) -{ -} -static inline void check_x2apic(void) -{ -} -static inline void enable_x2apic(void) -{ -} -static inline int x2apic_enabled(void) -{ - return 0; -} -static inline void x2apic_force_phys(void) -{ -} +static inline void disable_x2apic(void) { } +static inline void check_x2apic(void) { } +static inline void enable_x2apic(void) { } +static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) -- cgit From 44e25ff9e6912347a1a54c757fc75681d0dc42d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:24 +0000 Subject: x86/x2apic: Disable x2apic from nox2apic setup There is no point in postponing the hardware disablement of x2apic. It can be disabled right away in the nox2apic setup function. Disable it right away and set the state to DISABLED . This allows to remove all the nox2apic conditionals all over the place. 
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211703.051214090@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 951caa17d8ba..5d7488e9b66e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -187,7 +187,6 @@ static inline int x2apic_enabled(void) #define x2apic_supported() (cpu_has_x2apic) #else -static inline void disable_x2apic(void) { } static inline void check_x2apic(void) { } static inline void enable_x2apic(void) { } static inline int x2apic_enabled(void) { return 0; } -- cgit From 659006bf3ae37a08706907ce1a36ddf57c9131d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:26 +0000 Subject: x86/x2apic: Split enable and setup function enable_x2apic() is a convoluted unreadable mess because it is used for both enablement in early boot and for setup in cpu_init(). Split the code into x2apic_enable() for enablement and x2apic_setup() for setup of (secondary cpus). Make use of the new state tracking to simplify the logic. 
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211703.129287153@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5d7488e9b66e..ac60c603f8dd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -179,7 +179,7 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; extern void __init check_x2apic(void); -extern void enable_x2apic(void); +extern void x2apic_setup(void); static inline int x2apic_enabled(void) { return cpu_has_x2apic && apic_is_x2apic_enabled(); @@ -188,7 +188,7 @@ static inline int x2apic_enabled(void) #define x2apic_supported() (cpu_has_x2apic) #else static inline void check_x2apic(void) { } -static inline void enable_x2apic(void) { } +static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) -- cgit From f77aa308e5a6144a47311ad6905a1a72bc0014f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:29 +0000 Subject: x86/smpboot: Move smpboot inlines to code No point for a separate header file. 
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211703.304126687@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/smpboot_hooks.h | 68 ------------------------------------ 1 file changed, 68 deletions(-) delete mode 100644 arch/x86/include/asm/smpboot_hooks.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h deleted file mode 100644 index 0da7409f0bec..000000000000 --- a/arch/x86/include/asm/smpboot_hooks.h +++ /dev/null @@ -1,68 +0,0 @@ -/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws - * which needs to alter them. */ - -static inline void smpboot_clear_io_apic_irqs(void) -{ -#ifdef CONFIG_X86_IO_APIC - io_apic_irqs = 0; -#endif -} - -static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) -{ - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0xa, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = - start_eip >> 4; - pr_debug("2.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = - start_eip & 0xf; - pr_debug("3.\n"); -} - -static inline void smpboot_restore_warm_reset_vector(void) -{ - unsigned long flags; - - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); - - *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; -} - -static inline void __init smpboot_setup_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - /* - * Here we can be sure that there is an IO-APIC in the system. 
Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else { - nr_ioapics = 0; - } -#endif -} - -static inline void smpboot_clear_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - nr_ioapics = 0; -#endif -} -- cgit From 8686608336e11276d72d020cb0b67bee70d9a5cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:30 +0000 Subject: x86/ioapic: Provide stub functions for IOAPIC=n To avoid lots of ifdeffery provide proper stubs for setup_IO_APIC(), enable_IO_APIC() and setup_ioapic_dest(). Signed-off-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Link: http://lkml.kernel.org/r/20150115211703.397170414@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index bf006cce9418..2f91685fe1cd 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -279,6 +279,11 @@ static inline void disable_ioapic_support(void) { } #define native_ioapic_set_affinity NULL #define native_setup_ioapic_entry NULL #define native_eoi_ioapic_pin NULL + +static inline void setup_IO_APIC(void) { } +static inline void enable_IO_APIC(void) { } +static inline void setup_ioapic_dest(void) { } + #endif #endif /* _ASM_X86_IO_APIC_H */ -- cgit From 05f7e46d2aac359b6bcfc06b302bdd03ca0bcada Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:40 +0000 Subject: x86/smpboot: Move apic init code to apic.c We better provide proper functions which implement the required code flow in the apic code rather than letting the smpboot code open code it. That allows to make more functions static and confines the APIC functionality to apic.c where it belongs.
Signed-off-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Link: http://lkml.kernel.org/r/20150115211703.907616730@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ac60c603f8dd..92f34042be85 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -201,7 +201,6 @@ extern int get_physical_broadcast(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); -extern void connect_bsp_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); @@ -209,8 +208,6 @@ extern int verify_local_APIC(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void setup_local_APIC(void); -extern void end_local_APIC_setup(void); -extern void bsp_end_local_APIC_setup(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); @@ -218,6 +215,9 @@ extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); extern int apic_force_enable(unsigned long addr); +extern int apic_bsp_setup(void); +extern void apic_ap_setup(void); + /* * On 32bit this is mach-xxx local */ -- cgit From 374aab339f10f0510cec0e79d752d31d84b08aa2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:44 +0000 Subject: x86/apic: Reuse apic_bsp_setup() for UP APIC setup Extend apic_bsp_setup() so the same code flow can be used for APIC_init_uniprocessor(). Folded Jiang's fix to provide proper ordering of the UP setup.
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211704.084765674@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 92f34042be85..92003f3c8a42 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -215,7 +215,7 @@ extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); extern int apic_force_enable(unsigned long addr); -extern int apic_bsp_setup(void); +extern int apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); /* -- cgit From 801806d956c2c198b9fdd3afd156a536f9a3a139 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 26 Jan 2015 09:32:23 +0200 Subject: KVM: x86: IRET emulation does not clear NMI masking The IRET instruction should clear NMI masking, but the current implementation does not do so. 
Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index eb181178fe0b..57a9d94fe160 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,7 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); }; typedef u32 __attribute__((vector_size(16))) sse128_t; -- cgit From 853d0289340026b30f93fd0e768340221d4e605c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 5 Jan 2015 14:13:41 +0000 Subject: xen/grant-table: pre-populate kernel unmap ops for xen_gnttab_unmap_refs() When unmapping grants, instead of converting the kernel map ops to unmap ops on the fly, pre-populate the set of unmap ops. This allows the grant unmap for the kernel mappings to be trivially batched in the future. 
Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini --- arch/x86/include/asm/xen/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 5eea09915a15..e9f52fe2d56a 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -55,7 +55,7 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); -- cgit From 0bb599fd30108883b00c7d4a226eeb49111e6932 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 5 Jan 2015 17:06:01 +0000 Subject: xen: remove scratch frames for ballooned pages and m2p override The scratch frame mappings for ballooned pages and the m2p override are broken. Remove them in preparation for replacing them with simpler mechanisms that work. The scratch pages did not ensure that the page was not in use. In particular, the foreign page could still be in use by hardware. If the guest reused the frame the hardware could read or write that frame. The m2p override did not handle the same frame being granted by two different grant references. Trying an M2P override lookup in this case is impossible. With the m2p override removed, the grant map/unmap for the kernel mappings (for x86 PV) can be easily batched in set_foreign_p2m_mapping() and clear_foreign_p2m_mapping().
Signed-off-by: David Vrabel Reviewed-by: Stefano Stabellini --- arch/x86/include/asm/xen/page.h | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e9f52fe2d56a..358dcd338915 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -57,7 +57,6 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); -extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); /* * Helper functions to write or read unsigned long values to/from @@ -154,21 +153,12 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) != mfn) { - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. - * - * m2p_find_override_pfn returns ~0 if it doesn't find anything. - */ - pfn = m2p_find_override_pfn(mfn, ~0); - } + if (__pfn_to_mfn(pfn) != mfn) + pfn = ~0; /* - * pfn is ~0 if there are no entries in the m2p for mfn or if the - * entry doesn't map back to the mfn and m2p_override doesn't have a - * valid entry for it. + * pfn is ~0 if there are no entries in the m2p for mfn or the + * entry doesn't map back to the mfn. */ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn)) pfn = mfn; -- cgit From f4b4b1808690c37c7c703d43789c1988c5e7fdeb Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:24 +0800 Subject: KVM: MMU: Add mmu help functions to support PML This patch adds new mmu layer functions to clear/set D-bit for memory slot, and to write protect superpages for memory slot. 
In case of PML, CPU logs the dirty GPA automatically to PML buffer when CPU updates D-bit from 0 to 1, therefore we don't have to write protect 4K pages, instead, we only need to clear D-bit in order to log that GPA. For superpages, we still write protect it and let page fault code to handle dirty page logging, as we still need to split superpage to 4K pages in PML. As PML is always enabled during guest's lifetime, to eliminate unnecessary PML GPA logging, we set D-bit manually for the slot with dirty logging disabled. Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 843bea0e70fd..4f6369b6f7d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -835,6 +835,15 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -- cgit From 1c91cad42366ce0799ca17e7ad6995418741d012 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:26 +0800 Subject: KVM: x86: Change parameter of kvm_mmu_slot_remove_write_access This patch changes the second parameter of kvm_mmu_slot_remove_write_access from 'slot id' to 'struct kvm_memory_slot *' to 
align with kvm_x86_ops dirty logging hooks, which will be introduced in further patch. Better way is to change second parameter of kvm_arch_commit_memory_region from 'struct kvm_userspace_memory_region *' to 'struct kvm_memory_slot * new', but it requires changes on other non-x86 ARCH too, so avoid it now. Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f6369b6f7d2..67a98d793bf2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -834,7 +834,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, -- cgit From 88178fd4f7187bbe290c5d373fd44aabec891934 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:27 +0800 Subject: KVM: x86: Add new dirty logging kvm_x86_ops for PML This patch adds new kvm_x86_ops dirty logging hooks to enable/disable dirty logging for particular memory slot, and to flush potentially logged dirty GPAs before reporting slot->dirty_bitmap to userspace. kvm x86 common code calls these hooks when they are available so PML logic can be hidden to VMX specific. SVM won't be impacted as these hooks remain NULL there. 
Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 67a98d793bf2..57916ecb9b92 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -802,6 +802,31 @@ struct kvm_x86_ops { int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* + * Arch-specific dirty logging hooks. These hooks are only supposed to + * be valid if the specific arch has hardware-accelerated dirty logging + * mechanism. Currently only for PML on VMX. + * + * - slot_enable_log_dirty: + * called when enabling log dirty mode for the slot. + * - slot_disable_log_dirty: + * called when disabling log dirty mode for the slot. + * also called when slot is created with log dirty disabled. + * - flush_log_dirty: + * called before reporting dirty_bitmap to userspace. + * - enable_log_dirty_pt_masked: + * called when reenabling log dirty for the GFNs in the mask after + * corresponding bits are cleared in slot->dirty_bitmap. + */ + void (*slot_enable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*slot_disable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*flush_log_dirty)(struct kvm *kvm); + void (*enable_log_dirty_pt_masked)(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t offset, unsigned long mask); }; struct kvm_arch_async_pf { -- cgit From 843e4330573cc5261ae260ce0b83dc570d8cdc05 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 28 Jan 2015 10:54:28 +0800 Subject: KVM: VMX: Add PML support in VMX This patch adds PML support in VMX. A new module parameter 'enable_pml' is added to allow user to enable/disable it manually. 
Signed-off-by: Kai Huang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 45afaee9555c..da772edd19ab 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -121,6 +122,7 @@ enum vmcs_field { GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -140,6 +142,8 @@ enum vmcs_field { VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, TSC_OFFSET = 0x00002010, TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, -- cgit From 2e6d015799d523dcce11c7d1465e6feb7b69fab1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 2 Feb 2015 15:26:09 -0200 Subject: KVM: x86: revert "add method to test PIR bitmap vector" Revert 7c6a98dfa1ba9dc64a62e73624ecea9995736bbd, given that testing PIR is not necessary anymore. 
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 57916ecb9b92..9dbc7435cbc2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -767,7 +767,6 @@ struct kvm_x86_ops { void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); - bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); -- cgit From 874e52086f9f1b9f9bdfbf98cca8506b110b63ba Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Jan 2015 15:17:22 +0200 Subject: x86, mrst: remove Moorestown specific serial drivers Intel Moorestown platform support was removed a few years ago. This is a follow-up which removes Moorestown specific code for the serial devices. It includes mrst_max3110 and earlyprintk bits. This was used on SFI (Medfield, Clovertrail) based platforms as well, though new ones use the normal serial interface for the console service.
Signed-off-by: Andy Shevchenko Acked-by: David Cohen Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-mid.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index e34e097b6f9d..705d35708a50 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern struct console early_mrst_console; -extern void mrst_early_console_init(void); - extern struct console early_hsu_console; extern void hsu_early_console_init(const char *); -- cgit From 12cf89b550d13eb7cb86ef182bd6c04345a33a1f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Feb 2015 16:45:18 -0600 Subject: livepatch: rename config to CONFIG_LIVEPATCH Rename CONFIG_LIVE_PATCHING to CONFIG_LIVEPATCH to make the naming of the config and the code more consistent. 
Signed-off-by: Josh Poimboeuf Reviewed-by: Jingoo Han Signed-off-by: Jiri Kosina --- arch/x86/include/asm/livepatch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 26e58134c8cb..a455a53d789a 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -24,7 +24,7 @@ #include #include -#ifdef CONFIG_LIVE_PATCHING +#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,7 +40,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } #else -#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#error Live patching support is disabled; check CONFIG_LIVEPATCH #endif #endif /* _ASM_X86_LIVEPATCH_H */ -- cgit From 375074cc736ab1d89a708c0a8d7baa4a70d5d476 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:07 -0700 Subject: x86: Clean up cr4 manipulation CR4 manipulation was split, seemingly at random, between direct (write_cr4) and using a helper (set/clear_in_cr4). Unfortunately, the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code, which only a small subset of users actually wanted. This patch replaces all cr4 access in functions that don't leave cr4 exactly the way they found it with new helpers cr4_set_bits, cr4_clear_bits, and cr4_set_bits_and_update_boot. 
Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Linus Torvalds Link: http://lkml.kernel.org/r/495a10bdc9e67016b8fd3945700d46cfd5c12c2f.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 33 --------------------------------- arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/virtext.h | 3 ++- 3 files changed, 39 insertions(+), 34 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a092a0cce0b7..ec1c93588cef 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -extern unsigned long mmu_cr4_features; -extern u32 *trampoline_cr4_features; - -static inline void set_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features |= mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); -} - -static inline void clear_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features &= ~mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); -} - typedef struct { unsigned long seg; } mm_segment_t; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 04905bfc508b..fc0c4bc356ce 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,6 +15,43 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. This should only be used + * during boot on the boot cpu. 
+ */ +extern unsigned long mmu_cr4_features; +extern u32 *trampoline_cr4_features; + +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + static inline void __native_flush_tlb(void) { native_write_cr3(native_read_cr3()); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 5da71c27cc59..f41e19ca717b 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -19,6 +19,7 @@ #include #include +#include /* * VMX functions: @@ -40,7 +41,7 @@ static inline int cpu_has_vmx(void) static inline void cpu_vmxoff(void) { asm volatile (ASM_VMX_VMXOFF : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static inline int cpu_vmx_enabled(void) -- cgit From 1e02ce4cccdcb9688386e5b8d2c9fa4660b45389 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:08 -0700 Subject: x86: Store a per-cpu shadow copy of CR4 Context switches and TLB flushes can change individual bits of CR4. CR4 reads take several cycles, so store a shadow copy of CR4 in a per-cpu variable. To avoid wasting a cache line, I added the CR4 shadow to cpu_tlbstate, which is already touched in switch_mm. The heaviest users of the cr4 shadow will be switch_mm and __switch_to_xtra, and __switch_to_xtra is called shortly after switch_mm during context switch, so the cacheline is likely to be hot. 
Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/3a54dd3353fffbf84804398e00dfdc5b7c1afd7d.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 6 ++--- arch/x86/include/asm/special_insns.h | 6 ++--- arch/x86/include/asm/tlbflush.h | 52 +++++++++++++++++++++++++++--------- arch/x86/include/asm/virtext.h | 2 +- 4 files changed, 46 insertions(+), 20 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 32444ae939ca..965c47d254aa 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_cpu_ops.write_cr4, x); } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index e820c080a4e9..6a4b00fafb00 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x) native_write_cr3(x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return native_read_cr4(); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return native_read_cr4_safe(); } -static 
inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fc0c4bc356ce..cd791948b286 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,14 +15,37 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +struct tlb_state { +#ifdef CONFIG_SMP + struct mm_struct *active_mm; + int state; +#endif + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by + * disabling interrupts when modifying either one. + */ + unsigned long cr4; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +/* Initialize cr4 shadow for this CPU. */ +static inline void cr4_init_shadow(void) +{ + this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); +} + /* Set in this cpu's CR4. */ static inline void cr4_set_bits(unsigned long mask) { unsigned long cr4; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 | mask) != cr4) { + cr4 |= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } } /* Clear in this cpu's CR4. */ @@ -30,9 +53,18 @@ static inline void cr4_clear_bits(unsigned long mask) { unsigned long cr4; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 & ~mask) != cr4) { + cr4 &= ~mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } +} + +/* Read the CR4 shadow. 
*/ +static inline unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); } /* @@ -61,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void) { unsigned long cr4; - cr4 = native_read_cr4(); + cr4 = this_cpu_read(cpu_tlbstate.cr4); /* clear PGE */ native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ @@ -221,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -struct tlb_state { - struct mm_struct *active_mm; - int state; -}; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); - static inline void reset_lazy_tlbstate(void) { this_cpu_write(cpu_tlbstate.state, 0); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index f41e19ca717b..cce9ee68e335 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -46,7 +46,7 @@ static inline void cpu_vmxoff(void) static inline int cpu_vmx_enabled(void) { - return read_cr4() & X86_CR4_VMXE; + return __read_cr4() & X86_CR4_VMXE; } /** Disable VMX if it is enabled on the current CPU -- cgit From 22c4bd9fa921c2b1b3f2420d7b9dabbe982f3059 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:09 -0700 Subject: x86: Add a comment clarifying LDT context switching The code is correct, but only for a rather subtle reason. This confused me for quite a while when I read switch_mm, so clarify the code to avoid confusing other people, too. TBH, I wouldn't be surprised if this code was only correct by accident. 
Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/0db86397f968996fb772c443c251415b0b430ddd.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4b75d591eb5e..52c18359f1dc 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -55,12 +55,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* * Load the LDT, if the LDT is different. * - * It's possible leave_mm(prev) has been called. If so, - * then prev->context.ldt could be out of sync with the - * LDT descriptor or the LDT register. This can only happen - * if prev->context.ldt is non-null, since we never free - * an LDT. But LDTs can't be shared across mms, so - * prev->context.ldt won't be equal to next->context.ldt. + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never free an LDT while the mm still exists. That + * means that next->context.ldt != prev->context.ldt, + * because mms never share an LDT. */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); -- cgit From 7911d3f7af14a614617e38245fedf98a724e46a9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:12 -0700 Subject: perf/x86: Only allow rdpmc if a perf_event is mapped We currently allow any process to use rdpmc. 
This significantly weakens the protection offered by PR_TSC_DISABLED, and it could be helpful to users attempting to exploit timing attacks. Since we can't enable access to individual counters, use a very coarse heuristic to limit access to rdpmc: allow access only when a perf_event is mmapped. This protects seccomp sandboxes. There is plenty of room to further tighten these restrictions. For example, this allows rdpmc for any x86_pmu event, but it's only useful for self-monitoring tasks. As a side effect, cap_user_rdpmc will now be false for AMD uncore events. This isn't a real regression, since .event_idx is disabled for these events anyway for the time being. Whenever that gets re-added, the cap_user_rdpmc code can be adjusted or refactored accordingly. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Linus Torvalds Link: http://lkml.kernel.org/r/a2bdb3cf3a1d70c26980d7c6dddfbaa69f3182bf.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu.h | 2 ++ arch/x86/include/asm/mmu_context.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 876e74e8eec7..09b9620a73b4 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,6 +19,8 @@ typedef struct { struct mutex lock; void __user *vdso; + + atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 52c18359f1dc..89c1fece224e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -18,6 +18,18 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, } #endif /* !CONFIG_PARAVIRT */ +#ifdef CONFIG_PERF_EVENTS +static inline void 
load_mm_cr4(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.perf_rdpmc_allowed)) + cr4_set_bits(X86_CR4_PCE); + else + cr4_clear_bits(X86_CR4_PCE); +} +#else +static inline void load_mm_cr4(struct mm_struct *mm) {} +#endif + /* * Used for LDT copy/destruction. */ @@ -52,6 +64,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); + /* Load per-mm CR4 state */ + load_mm_cr4(next); + /* * Load the LDT, if the LDT is different. * @@ -87,6 +102,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); load_LDT_nolock(&next->context); } } -- cgit From a66734297f78707ce39d756b656bfae861d53f62 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:13 -0700 Subject: perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks While perfmon2 is a sufficiently evil library (it pokes MSRs directly) that breaking it is fair game, it's still useful, so we might as well try to support it. This allows users to write 2 to /sys/devices/cpu/rdpmc to disable all rdpmc protection so that hacks like perfmon2 can continue to work. At some point, if perf_event becomes fast enough to replace perfmon2, then this can go. 
Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Linus Torvalds Link: http://lkml.kernel.org/r/caac3c1c707dcca48ecbc35f4def21495856f479.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 89c1fece224e..883f6b933fa4 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -19,9 +19,12 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, #endif /* !CONFIG_PARAVIRT */ #ifdef CONFIG_PERF_EVENTS +extern struct static_key rdpmc_always_available; + static inline void load_mm_cr4(struct mm_struct *mm) { - if (atomic_read(&mm->context.perf_rdpmc_allowed)) + if (static_key_true(&rdpmc_always_available) || + atomic_read(&mm->context.perf_rdpmc_allowed)) cr4_set_bits(X86_CR4_PCE); else cr4_clear_bits(X86_CR4_PCE); -- cgit From 1c2b364b225a5a93dbd1f317bd000d2fec2694be Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Thu, 5 Feb 2015 17:22:26 +0800 Subject: kvm: remove KVM_MMIO_SIZE After f78146b0f923, "KVM: Fix page-crossing MMIO", and 87da7e66a405, "KVM: x86: fix vcpu->mmio_fragments overflow", actually KVM_MMIO_SIZE is gone. 
Signed-off-by: Tiejun Chen Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9dbc7435cbc2..848947ac6ade 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,8 +38,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_MMIO_SIZE 16 - #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 -- cgit From b4b55cda587442477a3a9f0669e26bba4b7800c0 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:47 +0800 Subject: x86/PCI: Refine the way to release PCI IRQ resources Some PCI device drivers assume that pci_dev->irq won't change after calling pci_disable_device() and pci_enable_device() during suspend and resume. Commit c03b3b0738a5 ("x86, irq, mpparse: Release IOAPIC pin when PCI device is disabled") frees PCI IRQ resources when pci_disable_device() is called and reallocate IRQ resources when pci_enable_device() is called again. This breaks the above assumption. So commit 3eec595235c1 ("x86, irq, PCI: Keep IRQ assignment for PCI devices during suspend/hibernation") and 9eabc99a635a ("x86, irq, PCI: Keep IRQ assignment for runtime power management") fix the issue by avoiding freeing/reallocating IRQ resources during PCI device suspend/resume. They achieve this by checking dev.power.is_prepared and dev.power.runtime_status. PM maintainer, Rafael, then pointed out that it's really an ugly fix which leaks PM internal state information to the IRQ subsystem. Recently David Vrabel also reported a regression in pciback driver caused by commit cffe0a2b5a34 ("x86, irq: Keep balance of IOAPIC pin reference count"). Please refer to: http://lkml.org/lkml/2015/1/14/546 So this patch refines the way to release PCI IRQ resources. 
Instead of releasing PCI IRQ resources in pci_disable_device()/ pcibios_disable_device(), we now release them at driver unbinding notification BUS_NOTIFY_UNBOUND_DRIVER. In other words, we only release PCI IRQ resources when there's no driver bound to the PCI device, and it keeps the assumption that pci_dev->irq won't change through multiple invocations of pci_enable_device()/pci_disable_device(). Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/pci_x86.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8d3c3d..fa1195dae425 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); -- cgit From f7819512996361280b86259222456fcf15aad926 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Feb 2015 18:20:58 +0100 Subject: kvm: add halt_poll_ns module parameter This patch introduces a new module parameter for the KVM module; when it is present, KVM attempts a bit of polling on every HLT before scheduling itself out via kvm_vcpu_block. This parameter helps a lot for latency-bound workloads---in particular I tested it with O_DSYNC writes with a battery-backed disk in the host. In this case, writes are fast (because the data doesn't have to go all the way to the platters) but they cannot be merged by either the host or the guest. 
KVM's performance here is usually around 30% of bare metal, or 50% if you use cache=directsync or cache=writethrough (these parameters avoid that the guest sends pointless flush requests, and at the same time they are not slow because of the battery-backed cache). The bad performance happens because on every halt the host CPU decides to halt itself too. When the interrupt comes, the vCPU thread is then migrated to a new physical CPU, and in general the latency is horrible because the vCPU thread has to be scheduled back in. With this patch performance reaches 60-65% of bare metal and, more importantly, 99% of what you get if you use idle=poll in the guest. This means that the tunable gets rid of this particular bottleneck, and more work can be done to improve performance in the kernel or QEMU. Of course there is some price to pay; every time an otherwise idle vCPU is interrupted by an interrupt, it will poll unnecessarily and thus impose a little load on the host. The above results were obtained with a mostly random value of the parameter (500000), and the load was around 1.5-2.5% CPU usage on one of the host's cores for each idle guest vCPU. The patch also adds a new stat, /sys/kernel/debug/kvm/halt_successful_poll, that can be used to tune the parameter. It counts how many HLT instructions received an interrupt during the polling period; each successful poll avoids that Linux schedules the VCPU thread out and back in, and may also avoid a likely trip to C1 and back for the physical CPU. While the VM is idle, a Linux 4 VCPU VM halts around 10 times per second. Of these halts, almost all are failed polls. During the benchmark, instead, basically all halts end within the polling period, except a more or less constant stream of 50 per second coming from vCPUs that are not running the benchmark. The wasted time is thus very low. Things may be slightly different for Windows VMs, which have a ~10 ms timer tick. 
The effect is also visible on Marcelo's recently-introduced latency test for the TSC deadline timer. Though of course a non-RT kernel has awful latency bounds, the latency of the timer is around 8000-10000 clock cycles compared to 20000-120000 without setting halt_poll_ns. For the TSC deadline timer, thus, the effect is both a smaller average latency and a smaller variance. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 848947ac6ade..a236e39cc385 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -655,6 +655,7 @@ struct kvm_vcpu_stat { u32 irq_window_exits; u32 nmi_window_exits; u32 halt_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; -- cgit From 0a191362058391878cc2a4d4ccddcd8223eb4f79 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:11:22 -0800 Subject: x86: drop _PAGE_FILE and pte_file()-related helpers We've replaced remap_file_pages(2) implementation with emulation. Nobody creates non-linear mapping anymore. Signed-off-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 38 +---------------------------------- arch/x86/include/asm/pgtable-3level.h | 12 ----------- arch/x86/include/asm/pgtable.h | 20 ------------------ arch/x86/include/asm/pgtable_64.h | 6 +----- arch/x86/include/asm/pgtable_types.h | 3 --- 5 files changed, 2 insertions(+), 77 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 206a87fdd22d..fd74a11959de 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range. - */ -#define PTE_FILE_MAX_BITS 29 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, 
PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + - _PAGE_FILE, - }; -} - /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 81bb91b49a88..cdaa58c9b39e 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -/* - * Bits 0, 6 and 7 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - * - * For soft-dirty tracking 11 bit is taken from - * the low part of pte as well. - */ -#define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) \ - ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) -#define PTE_FILE_MAX_BITS 32 - /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8a5454acc99..0fe03f834fb1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte) return pte_flags(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) -{ - return pte_flags(pte) & _PAGE_FILE; -} - static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; @@ -329,21 +324,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, 
_PAGE_SOFT_DIRTY); -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SOFT_DIRTY; -} - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4572b2f30237..e227970f983e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* PUD - Level3 access */ /* PMD - Level 2 access */ -#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ - _PAGE_FILE }) -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. */ @@ -145,7 +141,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #ifdef CONFIG_NUMA_BALANCING /* Automatic NUMA balancing needs to be distinguishable from swap entries */ #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 25bcd4a89517..5185a4f599ec 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -38,8 +38,6 @@ /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -/* - set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -114,7 +112,6 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | 
_PAGE_USER | \ -- cgit From d9bab50aa46ce46dd4537d455eb13b200cdac516 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:28:01 +1030 Subject: lguest: remove NOTIFY call and eventfd facility. Disappointing, as this was kind of neat (especially getting to use RCU to manage the address -> eventfd mapping). But now the devices are PCI handled in userspace, we get rid of both the NOTIFY hypercall and the interface to connect an eventfd. Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d33877..ef01fef3eebc 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 -- cgit From d016bf7ece53b2b947bfd769e0842fd2feb7556b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:41 -0800 Subject: mm: make FIRST_USER_ADDRESS unsigned long on all archs LKP has triggered a compiler warning after my recent patch "mm: account pmd page tables to the process": mm/mmap.c: In function 'exit_mmap': >> mm/mmap.c:2857:2: warning: right shift count >= width of type [enabled by default] The code: > 2857 WARN_ON(mm_nr_pmds(mm) > 2858 round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); In this, on tile, we have FIRST_USER_ADDRESS defined as 0. round_up() has the same type -- int. PUD_SHIFT. I think the best way to fix it is to define FIRST_USER_ADDRESS as unsigned long. On every arch for consistency. Signed-off-by: Kirill A. 
Shutemov Reported-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 5185a4f599ec..3e0230c94cff 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -4,7 +4,7 @@ #include #include -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ -- cgit From e7bb4b6d1609cce391af1e7bc6f31d14f1a3a890 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:19 -0800 Subject: mm: add p[te|md] protnone helpers for use by NUMA balancing This is a preparatory patch that introduces protnone helpers for automatic NUMA balancing. Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar K.V Tested-by: Sasha Levin Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0fe03f834fb1..f519b0b529dd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -483,6 +483,22 @@ static inline int pmd_present(pmd_t pmd) _PAGE_NUMA); } +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. 
See the + * comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PROTNONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_PROTNONE; +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be -- cgit From 21d9ee3eda7792c45880b2f11bff8e95c9a061fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:32 -0800 Subject: mm: remove remaining references to NUMA hinting bits and helpers This patch removes the NUMA PTE bits and associated helpers. As a side-effect it increases the maximum possible swap space on x86-64. One potential source of problems is races between the marking of PTEs PROT_NONE, NUMA hinting faults and migration. It must be guaranteed that a PTE being protected is not faulted in parallel, seen as a pte_none and corrupting memory. The base case is safe but transhuge has had problems in the past due to a different migration mechanism and a dependence on page lock to serialise migrations, and so warrants a closer look. task_work hinting update parallel fault ------------------------ -------------- change_pmd_range change_huge_pmd __pmd_trans_huge_lock pmdp_get_and_clear __handle_mm_fault pmd_none do_huge_pmd_anonymous_page read? pmd_lock blocks until hinting complete, fail !pmd_none test write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none pmd_modify set_pmd_at task_work hinting update parallel migration ------------------------ ------------------ change_pmd_range change_huge_pmd __pmd_trans_huge_lock pmdp_get_and_clear __handle_mm_fault do_huge_pmd_numa_page migrate_misplaced_transhuge_page pmd_lock waits for updates to complete, recheck pmd_same pmd_modify set_pmd_at Both of those are safe and the case where a transhuge page is inserted during a protection update is unchanged.
The case where two processes try migrating at the same time is unchanged by this series so should still be ok. I could not find a case where we are accidentally depending on the PTE not being cleared and flushed. If one is missed, it'll manifest as corruption problems that start triggering shortly after this series is merged and only happen when NUMA balancing is enabled. Signed-off-by: Mel Gorman Tested-by: Sasha Levin Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Mark Brown Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 22 +++---------------- arch/x86/include/asm/pgtable_64.h | 5 ----- arch/x86/include/asm/pgtable_types.h | 41 ++---------------------------------- 3 files changed, 5 insertions(+), 63 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f519b0b529dd..34d42a7d5595 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -442,13 +442,6 @@ static inline int pte_same(pte_t a, pte_t b) } static inline int pte_present(pte_t a) -{ - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && 
mm_tlb_flush_pending(mm)) return true; @@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } #ifdef CONFIG_NUMA_BALANCING @@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e227970f983e..2ee781114d34 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS 5 -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h index 3e0230c94cff..8c7c10802e9c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -27,14 +27,6 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. - */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -75,21 +67,6 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif -/* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - /* * Tracking soft dirty bit when a page goes to a swap is tricky. 
* We need a bit which can be stored in pte _and_ not conflict @@ -122,8 +99,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* * The cache modes defined here are used to translate between pure SW usage @@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_flags(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_NUMA_MASK; -} -#endif /* CONFIG_NUMA_BALANCING */ - #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) -- cgit From c819f37e7e174d68cd013abf33725b4e07ced023 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:38 -0800 Subject: x86: mm: restore original pte_special check Commit b38af4721f59 ("x86,mm: fix pte_special versus pte_numa") adjusted the pte_special check to take into account that a special pte had SPECIAL and neither PRESENT nor PROTNONE. Now that NUMA hinting PTEs are no longer modifying _PAGE_PRESENT it should be safe to restore the original pte_special behaviour. 
Signed-off-by: Mel Gorman Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 34d42a7d5595..67fc3d2b0aab 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -132,13 +132,7 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - /* - * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. - * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == - * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. - */ - return (pte_flags(pte) & _PAGE_SPECIAL) && - (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); + return pte_flags(pte) & _PAGE_SPECIAL; } static inline unsigned long pte_pfn(pte_t pte) -- cgit From f56141e3e2d9aabf7e6b89680ab572c2cdbb2a24 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Feb 2015 15:01:14 -0800 Subject: all arches, signal: move restart_block to struct task_struct If an attacker can cause a controlled kernel stack overflow, overwriting the restart block is a very juicy exploit target. This is because the restart_block is held in the same memory allocation as the kernel stack. Moving the restart block to struct task_struct prevents this exploit by making the restart_block harder to locate. Note that there are other fields in thread_info that are also easy targets, at least on some architectures. It's also a decent simplification, since the restart code is more or less identical on all architectures. [james.hogan@imgtec.com: metag: align thread_info::supervisor_stack] Signed-off-by: Andy Lutomirski Cc: Thomas Gleixner Cc: Al Viro Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: David Miller Acked-by: Richard Weinberger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Steven Miao Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Richard Kuo Cc: "Luck, Tony" Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman (powerpc) Tested-by: Michael Ellerman (powerpc) Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: Chris Metcalf Cc: Guan Xuetao Cc: Chris Zankel Cc: Max Filippov Cc: Oleg Nesterov Cc: Guenter Roeck Signed-off-by: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/thread_info.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e82e95abc92b..1d4e4f279a32 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +44,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) -- cgit From ef7f0d6a6ca8c9e4b27d78895af86c2fbfaeedb2 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:25 -0800 Subject: x86_64: add KASan support This patch adds arch specific code for kernel address sanitizer. 
16TB of virtual address space is used for shadow memory. It's located in range [ffffec0000000000 - fffffc0000000000] between vmemmap and %esp fixup stacks. At an early stage we map the whole shadow region with the zero page. Later, after pages are mapped to the direct mapping address range, we unmap zero pages from the corresponding shadow (see kasan_map_shadow()) and allocate and map real shadow memory, reusing the vmemmap_populate() function. Also replace __pa with __pa_nodebug before the shadow is initialized. __pa with CONFIG_DEBUG_VIRTUAL=y makes an external function call (__phys_addr); __phys_addr is instrumented, so __asan_load could be called before the shadow area is initialized. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Jim Davis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kasan.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 arch/x86/include/asm/kasan.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h new file mode 100644 index 000000000000..8b22422fbad8 --- /dev/null +++ b/arch/x86/include/asm/kasan.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_KASAN_H +#define _ASM_X86_KASAN_H + +/* + * Compiler uses shadow offset assuming that addresses start + * from 0.
Kernel addresses don't start from 0, so shadow + * for kernel really starts from compiler's shadow offset + + * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT + */ +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ + (0xffff800000000000ULL >> 3)) +/* 47 bits for kernel address -> (47 - 3) bits for shadow */ +#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3))) + +#ifndef __ASSEMBLY__ + +extern pte_t kasan_zero_pte[]; +extern pte_t kasan_zero_pmd[]; +extern pte_t kasan_zero_pud[]; + +#ifdef CONFIG_KASAN +void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_init(void); +#else +static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_init(void) { } +#endif + +#endif + +#endif -- cgit From 393f203f5fd54421fddb1e2a263f64d3876eeadb Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:56 -0800 Subject: x86_64: kasan: add interceptors for memset/memmove/memcpy functions Recently instrumentation of builtin function calls was removed from GCC 5.0. To check the memory accessed by such functions, userspace asan always uses interceptors for them. So now we should do this as well. This patch declares memset/memmove/memcpy as weak symbols. In mm/kasan/kasan.c we have our own implementation of those functions which checks memory before accessing it. Default memset/memmove/memcpy now always have aliases with '__' prefix. For files that are built without kasan instrumentation (e.g. mm/slub.c) the original mem* functions are replaced (via #define) with the prefixed variants, because we don't want to check memory accesses there. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/string_64.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 19e2c468fc2c..e4661196994e 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +extern void *__memcpy(void *to, const void *from, size_t len); + #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 extern void *memcpy(void *to, const void *from, size_t len); #else -extern void *__memcpy(void *to, const void *from, size_t len); #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ @@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); +void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); +void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); @@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) + +/* + * For files that not instrumented (e.g. mm/slub.c) we + * should use not instrumented version of mem* functions. 
+ */ + +#undef memcpy +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ -- cgit From c420f167db8c799d69fe43a801c58a7f02e9d57c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:59 -0800 Subject: kasan: enable stack instrumentation Stack instrumentation allows to detect out of bounds memory accesses for variables allocated on stack. Compiler adds redzones around every variable on stack and poisons redzones in function's prologue. Such approach significantly increases stack usage, so all in-kernel stacks size were doubled. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/page_64_types.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 75450b2c7be4..4edd53b79a81 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,17 +1,23 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 2 +#ifdef CONFIG_KASAN +#define KASAN_STACK_ORDER 1 +#else +#define KASAN_STACK_ORDER 0 +#endif + +#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) -#define IRQ_STACK_ORDER 2 +#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) #define DOUBLEFAULT_STACK 1 -- cgit From d6abfdb2022368d8c6c4be3f11a06656601a6cc2 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 6 Feb 2015 16:44:11 +0530 Subject: x86/spinlocks/paravirt: Fix memory corruption on unlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paravirt spinlock clears slowpath flag after doing unlock. 
As explained by Linus currently it does: prev = *lock; add_smp(&lock->tickets.head, TICKET_LOCK_INC); /* add_smp() is a full mb() */ if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) __ticket_unlock_slowpath(lock, prev); which is *exactly* the kind of things you cannot do with spinlocks, because after you've done the "add_smp()" and released the spinlock for the fast-path, you can't access the spinlock any more. Exactly because a fast-path lock might come in, and release the whole data structure. Linus suggested that we should not do any writes to lock after unlock(), and we can move slowpath clearing to fastpath lock. So this patch implements the fix with: 1. Moving slowpath flag to head (Oleg): Unlocked locks don't care about the slowpath flag; therefore we can keep it set after the last unlock, and clear it again on the first (try)lock. -- this removes the write after unlock. note that keeping slowpath flag would result in unnecessary kicks. By moving the slowpath flag from the tail to the head ticket we also avoid the need to access both the head and tail tickets on unlock. 2. use xadd to avoid read/write after unlock that checks the need for unlock_kick (Linus): We further avoid the need for a read-after-release by using xadd; the prev head value will include the slowpath flag and indicate if we need to do PV kicking of suspended spinners -- on modern chips xadd isn't (much) more expensive than an add + load. 
Result: setup: 16core (32 cpu +ht sandy bridge 8GB 16vcpu guest) benchmark overcommit %improve kernbench 1x -0.13 kernbench 2x 0.02 dbench 1x -1.77 dbench 2x -0.63 [Jeremy: Hinted missing TICKET_LOCK_INC for kick] [Oleg: Moved slowpath flag to head, ticket_equals idea] [PeterZ: Added detailed changelog] Suggested-by: Linus Torvalds Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Raghavendra K T Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Jones Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: Dave Jones Cc: David Vrabel Cc: Fernando Luis Vázquez Cao Cc: Konrad Rzeszutek Wilk Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Ulrich Obergfell Cc: Waiman Long Cc: a.ryabinin@samsung.com Cc: dave@stgolabs.net Cc: hpa@zytor.com Cc: jasowang@redhat.com Cc: jeremy@goop.org Cc: paul.gortmaker@windriver.com Cc: riel@redhat.com Cc: tglx@linutronix.de Cc: waiman.long@hp.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20150215173043.GA7471@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 94 ++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 48 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f8a2fc..cf87de3fc390 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key); static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) { - set_bit(0, (volatile unsigned long *)&lock->tickets.tail); + set_bit(0, (volatile unsigned long *)&lock->tickets.head); } #else /* !CONFIG_PARAVIRT_SPINLOCKS */ @@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ +static inline int __tickets_equal(__ticket_t 
one, __ticket_t two) +{ + return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); +} + +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, + __ticket_t head) +{ + if (head & TICKET_SLOWPATH_FLAG) { + arch_spinlock_t old, new; + + old.tickets.head = head; + new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; + old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; + new.tickets.tail = old.tickets.tail; + + /* try to clear slowpath flag when there are no contenders */ + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); + } +} static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { - return lock.tickets.head == lock.tickets.tail; + return __tickets_equal(lock.tickets.head, lock.tickets.tail); } /* @@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) if (likely(inc.head == inc.tail)) goto out; - inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { unsigned count = SPIN_THRESHOLD; do { - if (READ_ONCE(lock->tickets.head) == inc.tail) - goto out; + inc.head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(inc.head, inc.tail)) + goto clear_slowpath; cpu_relax(); } while (--count); __ticket_lock_spinning(lock, inc.tail); } -out: barrier(); /* make sure nothing creeps before the lock is taken */ +clear_slowpath: + __ticket_check_and_clear_slowpath(lock, inc.head); +out: + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) @@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) arch_spinlock_t old, new; old.tickets = READ_ONCE(lock->tickets); - if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) + if (!__tickets_equal(old.tickets.head, old.tickets.tail)) return 0; new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); + new.head_tail &= ~TICKET_SLOWPATH_FLAG; /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, 
old.head_tail, new.head_tail) == old.head_tail; } -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, - arch_spinlock_t old) -{ - arch_spinlock_t new; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - /* Perform the unlock on the "before" copy */ - old.tickets.head += TICKET_LOCK_INC; - - /* Clear the slowpath flag */ - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); - - /* - * If the lock is uncontended, clear the flag - use cmpxchg in - * case it changes behind our back though. - */ - if (new.tickets.head != new.tickets.tail || - cmpxchg(&lock->head_tail, old.head_tail, - new.head_tail) != old.head_tail) { - /* - * Lock still has someone queued for it, so wake up an - * appropriate waiter. - */ - __ticket_unlock_kick(lock, old.tickets.head); - } -} - static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG && - static_key_false(&paravirt_ticketlocks_enabled)) { - arch_spinlock_t prev; + static_key_false(&paravirt_ticketlocks_enabled)) { + __ticket_t head; - prev = *lock; - add_smp(&lock->tickets.head, TICKET_LOCK_INC); + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - /* add_smp() is a full mb() */ + head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) - __ticket_unlock_slowpath(lock, prev); + if (unlikely(head & TICKET_SLOWPATH_FLAG)) { + head &= ~TICKET_SLOWPATH_FLAG; + __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); + } } else __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } @@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return tmp.tail != tmp.head; + return !__tickets_equal(tmp.tail, tmp.head); } static inline int arch_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; + tmp.head &= ~TICKET_SLOWPATH_FLAG;
+ return (tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended @@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - __ticket_t head = ACCESS_ONCE(lock->tickets.head); + __ticket_t head = READ_ONCE(lock->tickets.head); for (;;) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); /* * We need to check "unlocked" in a loop, tmp.head == head * can be false positive because of overflow. */ - if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || - tmp.head != head) + if (__tickets_equal(tmp.head, tmp.tail) || + !__tickets_equal(tmp.head, head)) break; cpu_relax(); -- cgit From b273c2c2f2d2d13dc0bfa8923d52fbaf8fa56ae8 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Mon, 2 Feb 2015 20:27:11 +0100 Subject: x86/apic: Fix the devicetree build in certain configs Without this patch: LD init/built-in.o arch/x86/built-in.o: In function `dtb_lapic_setup': kernel/devicetree.c:155: undefined reference to `apic_force_enable' Makefile:923: recipe for target 'vmlinux' failed make: *** [vmlinux] Error 1 Signed-off-by: Ricardo Ribalda Delgado Reviewed-by: Maciej W. 
Rozycki Cc: David Rientjes Cc: Jan Beulich Link: http://lkml.kernel.org/r/1422905231-16067-1-git-send-email-ricardo.ribalda@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 92003f3c8a42..efc3b22d896e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -213,7 +213,15 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else extern int apic_force_enable(unsigned long addr); +#endif extern int apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); -- cgit From 28a375df16c2b6d01227541f3956568995aa5fda Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 30 Jan 2015 16:29:38 +0000 Subject: x86/intel/quark: Add Isolated Memory Regions for Quark X1000 Intel's Quark X1000 SoC contains a set of registers called Isolated Memory Regions. IMRs are accessed over the IOSF mailbox interface. IMRs are areas carved out of memory that define read/write access rights to the various system agents within the Quark system. For a given agent in the system it is possible to specify if that agent may read or write an area of memory defined by an IMR with a granularity of 1 KiB. Quark_SecureBootPRM_330234_001.pdf section 4.5 details the concept of IMRs quark-x1000-datasheet.pdf section 12.7.4 details the implementation of IMRs in silicon. eSRAM flush, CPU Snoop write-only, CPU SMM Mode, CPU non-SMM mode, RMU and PCIe Virtual Channels (VC0 and VC1) can have individual read/write access masks applied to them for a given memory region in Quark X1000. 
This enables IMRs to treat each memory transaction type listed above on an individual basis and to filter appropriately based on the IMR access mask for the memory region. Quark supports eight IMRs. Since all of the DMA capable SoC components in the X1000 are mapped to VC0 it is possible to define sections of memory as invalid for DMA write operations originating from Ethernet, USB, SD and any other DMA capable south-cluster component on VC0. Similarly it is possible to mark kernel memory as non-SMM mode read/write only or to mark BIOS runtime memory as SMM mode accessible only depending on the particular memory footprint on a given system. On an IMR violation Quark SoC X1000 systems are configured to reset the system, so ensuring that the IMR memory map is consistent with the EFI provided memory map is critical to ensure no IMR violations reset the system. The API for accessing IMRs is based on MTRR code but doesn't provide a /proc or /sys interface to manipulate IMRs. Defining the size and extent of IMRs is exclusively the domain of in-kernel code. Quark firmware sets up a series of locked IMRs around pieces of memory that firmware owns such as ACPI runtime data. During boot a series of unlocked IMRs are placed around items in memory to guarantee no DMA modification of those items can take place. Grub also places an unlocked IMR around the kernel boot params data structure and compressed kernel image. It is necessary for the kernel to tear down all unlocked IMRs in order to ensure that the kernel's view of memory passed via the EFI memory map is consistent with the IMR memory map. Without tearing down all unlocked IMRs on boot transitory IMRs such as those used to protect the compressed kernel image will cause IMR violations and system reboots. The IMR init code tears down all unlocked IMRs and sets a protective IMR around the kernel .text and .rodata as one contiguous block. 
This sanitizes the IMR memory map with respect to the EFI memory map and protects the read-only portions of the kernel from unwarranted DMA access. Tested-by: Ong, Boon Leong Signed-off-by: Bryan O'Donoghue Reviewed-by: Andy Shevchenko Reviewed-by: Darren Hart Reviewed-by: Ong, Boon Leong Cc: andy.shevchenko@gmail.com Cc: dvhart@infradead.org Link: http://lkml.kernel.org/r/1422635379-12476-2-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/imr.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 arch/x86/include/asm/imr.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h new file mode 100644 index 000000000000..cd2ce4068441 --- /dev/null +++ b/arch/x86/include/asm/imr.h @@ -0,0 +1,60 @@ +/* + * imr.h: Isolated Memory Region API + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _IMR_H +#define _IMR_H + +#include + +/* + * IMR agent access mask bits + * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register + * definitions. + */ +#define IMR_ESRAM_FLUSH BIT(31) +#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */ +#define IMR_RMU BIT(29) +#define IMR_VC1_SAI_ID3 BIT(15) +#define IMR_VC1_SAI_ID2 BIT(14) +#define IMR_VC1_SAI_ID1 BIT(13) +#define IMR_VC1_SAI_ID0 BIT(12) +#define IMR_VC0_SAI_ID3 BIT(11) +#define IMR_VC0_SAI_ID2 BIT(10) +#define IMR_VC0_SAI_ID1 BIT(9) +#define IMR_VC0_SAI_ID0 BIT(8) +#define IMR_CPU_0 BIT(1) /* SMM mode */ +#define IMR_CPU BIT(0) /* Non SMM mode */ +#define IMR_ACCESS_NONE 0 + +/* + * Read/Write access-all bits here include some reserved bits + * These are the values firmware uses and are accepted by hardware. 
+ * The kernel defines read/write access-all in the same way as firmware + * in order to have a consistent and crisp definition across firmware, + * bootloader and kernel. + */ +#define IMR_READ_ACCESS_ALL 0xBFFFFFFF +#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF + +/* Number of IMRs provided by Quark X1000 SoC */ +#define QUARK_X1000_IMR_MAX 0x08 +#define QUARK_X1000_IMR_REGBASE 0x40 + +/* IMR alignment bits - only bits 31:10 are checked for IMR validity */ +#define IMR_ALIGN 0x400 +#define IMR_MASK (IMR_ALIGN - 1) + +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock); + +int imr_remove_range(phys_addr_t base, size_t size); + +#endif /* _IMR_H */ -- cgit From f47233c2d34f243ecdaac179c3408a39ff9216a7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 13 Feb 2015 16:04:55 +0100 Subject: x86/mm/ASLR: Propagate base load address calculation Commit: e2b32e678513 ("x86, kaslr: randomize module base load address") makes the base address for module to be unconditionally randomized in case when CONFIG_RANDOMIZE_BASE is defined and "nokaslr" option isn't present on the commandline. This is not consistent with how choose_kernel_location() decides whether it will randomize kernel load base. Namely, CONFIG_HIBERNATION disables kASLR (unless "kaslr" option is explicitly specified on kernel commandline), which makes the state space larger than what module loader is looking at. IOW CONFIG_HIBERNATION && CONFIG_RANDOMIZE_BASE is a valid config option, kASLR wouldn't be applied by default in that case, but module loader is not aware of that. Instead of fixing the logic in module.c, this patch takes a more generic approach. It introduces a new bootparam setup data_type SETUP_KASLR and uses that to pass the information whether kaslr has been applied during kernel decompression, and sets a global 'kaslr_enabled' variable accordingly, so that any kernel code (module loading, livepatching, ...) can make decisions based on its value. 
x86 module loader is converted to make use of this flag. Signed-off-by: Jiri Kosina Acked-by: Kees Cook Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/alpine.LNX.2.00.1502101411280.10719@pobox.suse.cz [ Always dump correct kaslr status when panicking ] Signed-off-by: Borislav Petkov --- arch/x86/include/asm/page_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..3d43ce36eaba 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -51,6 +52,8 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern bool kaslr_enabled; + static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; -- cgit From e3a1f6cac1fe20e7ac01d96c914c25726723a64e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 19 Feb 2015 13:06:53 +0000 Subject: x86: pte_protnone() and pmd_protnone() must check entry is not present Since _PAGE_PROTNONE aliases _PAGE_GLOBAL it is only valid if _PAGE_PRESENT is clear. Make pte_protnone() and pmd_protnone() check for this. This fixes a 64-bit Xen PV guest regression introduced by 8a0516ed8b90 ("mm: convert p[te|md]_numa users to p[te|md]_protnone_numa"). Any userspace process would endlessly fault. In a 64-bit PV guest, userspace page table entries have _PAGE_GLOBAL set by the hypervisor. This meant that any fault on a present userspace entry (e.g., a write to a read-only mapping) would be misinterpreted as a NUMA hinting fault and the fault would not be correctly handled, resulting in the access endlessly faulting. 
Signed-off-by: David Vrabel Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 67fc3d2b0aab..a0c35bf6cb92 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -476,12 +476,14 @@ static inline int pmd_present(pmd_t pmd) */ static inline int pte_protnone(pte_t pte) { - return pte_flags(pte) & _PAGE_PROTNONE; + return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; } static inline int pmd_protnone(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PROTNONE; + return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit From 570e1aa84c376ff39809442f09c7606ddf62cfd1 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 20 Feb 2015 10:18:59 +0100 Subject: x86/mm/ASLR: Avoid PAGE_SIZE redefinition for UML subarch Commit f47233c2d34 ("x86/mm/ASLR: Propagate base load address calculation") causes PAGE_SIZE redefinition warnings for UML subarch builds. This is caused by added includes that were leftovers from previous patch versions and are not actually needed (especially page_types.h include in module.c). Drop those stray includes. Reported-by: kbuild test robot Signed-off-by: Jiri Kosina Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Kees Cook Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1502201017240.28769@pobox.suse.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 3d43ce36eaba..95e11f79f123 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,7 +3,6 @@ #include #include -#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -- cgit From 06c8173eb92bbfc03a0fe8bb64315857d0badd06 Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Thu, 5 Mar 2015 13:19:22 +0100 Subject: x86/fpu/xsaves: Fix improper uses of __ex_table Commit: f31a9f7c7169 ("x86/xsaves: Use xsaves/xrstors to save and restore xsave area") introduced alternative instructions for XSAVES/XRSTORS and commit: adb9d526e982 ("x86/xsaves: Add xsaves and xrstors support for booting time") added support for the XSAVES/XRSTORS instructions at boot time. Unfortunately both failed to properly protect them against faulting: The 'xstate_fault' macro will use the closest label named '1' backward and that ends up in the .altinstr_replacement section rather than in .text. This means that the kernel will never find in the __ex_table the .text address where this instruction might fault, leading to serious problems if userspace manages to trigger the fault. Signed-off-by: Quentin Casasnovas Signed-off-by: Jamie Iles [ Improved the changelog, fixed some whitespace noise. ] Acked-by: Borislav Petkov Acked-by: Linus Torvalds Cc: Cc: Allan Xavier Cc: H. 
Peter Anvin Cc: Thomas Gleixner Fixes: adb9d526e982 ("x86/xsaves: Add xsaves and xrstors support for booting time") Fixes: f31a9f7c7169 ("x86/xsaves: Use xsaves/xrstors to save and restore xsave area") Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xsave.h | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 5fa9770035dc..c9a6d68b8d62 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -82,18 +82,15 @@ static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) if (boot_cpu_has(X86_FEATURE_XSAVES)) asm volatile("1:"XSAVES"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XSAVE"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); - - asm volatile(xstate_fault - : "0" (0) - : "memory"); - return err; } @@ -112,18 +109,15 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) if (boot_cpu_has(X86_FEATURE_XSAVES)) asm volatile("1:"XRSTORS"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); else asm volatile("1:"XRSTOR"\n\t" "2:\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + xstate_fault + : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); - - asm volatile(xstate_fault - : "0" (0) - : "memory"); - return err; } @@ -149,9 +143,9 @@ static inline int xsave_state(struct xsave_struct *fx, u64 mask) */ alternative_input_2( "1:"XSAVE, - "1:"XSAVEOPT, + XSAVEOPT, X86_FEATURE_XSAVEOPT, - "1:"XSAVES, + XSAVES, X86_FEATURE_XSAVES, [fx] "D" (fx), "a" (lmask), "d" (hmask) : "memory"); @@ -178,7 +172,7 @@ static inline int xrstor_state(struct xsave_struct 
*fx, u64 mask) */ alternative_input( "1: " XRSTOR, - "1: " XRSTORS, + XRSTORS, X86_FEATURE_XSAVES, "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) : "memory"); -- cgit From f4c3686386393c120710dd34df2a74183ab805fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 13 Mar 2015 09:53:10 +0100 Subject: x86/fpu: Drop_fpu() should not assume that tsk equals current drop_fpu() does clear_used_math() and usually this is correct because tsk == current. However switch_fpu_finish()->restore_fpu_checking() is called before __switch_to() updates the "current_task" variable. If it fails, we will wrongly clear the PF_USED_MATH flag of the previous task. So use clear_stopped_child_used_math() instead. Signed-off-by: Oleg Nesterov Signed-off-by: Borislav Petkov Reviewed-by: Rik van Riel Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Pekka Riikonen Cc: Quentin Casasnovas Cc: Suresh Siddha Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150309171041.GB11388@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 0dbc08282291..72ba21a8b5fc 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -370,7 +370,7 @@ static inline void drop_fpu(struct task_struct *tsk) preempt_disable(); tsk->thread.fpu_counter = 0; __drop_fpu(tsk); - clear_used_math(); + clear_stopped_child_used_math(tsk); preempt_enable(); } -- cgit From 69797dafe35541bfff1989c0b37c66ed785faf0e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 Mar 2015 11:06:28 +0100 Subject: Revert "x86/mm/ASLR: Propagate base load address calculation" This reverts commit: f47233c2d34f ("x86/mm/ASLR: Propagate base load address calculation") The main reason for the revert is that the new boot flag does not work 
at all currently, and in order to make this work, we need non-trivial changes to the x86 boot code which we didn't manage to get done in time for merging. And even if we did, they would've been too risky so instead of rushing things and breaking booting 4.1 on boxes left and right, we will be very strict and conservative and will take our time with this to fix and test it properly. Reported-by: Yinghai Lu Signed-off-by: Borislav Petkov Cc: Ard Biesheuvel Cc: Baoquan He Cc: H. Peter Anvin Cc: Josh Triplett Cc: Junjie Mao Cc: Kees Cook Cc: Linus Torvalds Cc: Matt Fleming Link: http://lkml.kernel.org/r/20150316100628.GD22995@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 95e11f79f123..f97fbe3abb67 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,8 +51,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern bool kaslr_enabled; - static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; -- cgit From 9e8ce4b96b781b003e3174fbbc62e1d4388c8b8f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 20 Mar 2015 14:56:19 +0100 Subject: Revert "x86/PCI: Refine the way to release PCI IRQ resources" Commit b4b55cda5874 (Refine the way to release PCI IRQ resources) introduced a regression in the PCI IRQ resource management by causing the IRQ resource of a device, established when pci_enable_device() is called on a fully disabled device, to be released when the driver is unbound from the device, regardless of the enable_cnt. This leads to the situation that an ill-behaved driver can now make a device unusable to subsequent drivers by an imbalance in their use of pci_enable/disable_device(). 
That is a serious problem for secondary drivers like vfio-pci, which are innocent of the transgressions of the previous driver. Since the solution of this problem is not immediate and requires further discussion, revert commit b4b55cda5874 and the issue it was supposed to address (a bug related to xen-pciback) will be taken care of in a different way going forward. Reported-by: Alex Williamson Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/pci_x86.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195dae425..164e3f8d3c3d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); -- cgit From cae2a173fe94ab3a437416af6f092fae2e65837e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 6 Apr 2015 10:26:17 -0700 Subject: x86: clean up/fix 'copy_in_user()' tail zeroing The rule for 'copy_from_user()' is that it zeroes the remaining kernel buffer even when the copy fails halfway, just to make sure that we don't leave uninitialized kernel memory around. Because even if we check for errors, some kernel buffers stay around after the copy (think page cache). However, the x86-64 logic for user copies uses a copy_user_generic() function for all the cases, that set the "zerorest" flag for any fault on the source buffer. Which meant that it didn't just try to clear the kernel buffer after a failure in copy_from_user(), it also tried to clear the destination user buffer for the "copy_in_user()" case. 
Not only is that pointless, it also means that the clearing code has to worry about the tail clearing taking page faults for the user buffer case. Which is just stupid, since that case shouldn't happen in the first place. Get rid of the whole "zerorest" thing entirely, and instead just check if the destination is in kernel space or not. And then just use memset() to clear the tail of the kernel buffer if necessary. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 12a26b979bf1..f2f9b39b274a 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src, } unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest); +copy_user_handle_tail(char *to, char *from, unsigned len); #endif /* _ASM_X86_UACCESS_64_H */ -- cgit