summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/process_64.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-04 09:52:57 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-04 09:52:57 -0700
commitb0c79f49c343cda8954b3322984c32f258ca4ccb (patch)
treedd823d13683b7e6b0caebcaf3964df6150aee294 /arch/x86/kernel/process_64.c
parentf213a6c84c1b4b396a0713ee33cff0e02ba8235f (diff)
parentdd88a0a0c8615417fe6b4285769b5b772de87279 (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm updates from Ingo Molnar: - Introduce the ORC unwinder, which can be enabled via CONFIG_ORC_UNWINDER=y. The ORC unwinder is a lightweight, Linux kernel specific debuginfo implementation, which aims to be DWARF done right for unwinding. Objtool is used to generate the ORC unwinder tables during build, so the data format is flexible and kernel internal: there's no dependency on debuginfo created by an external toolchain. The ORC unwinder is almost two orders of magnitude faster than the (out of tree) DWARF unwinder - which is important for perf call graph profiling. It is also significantly simpler and is coded defensively: there has not been a single ORC related kernel crash so far, even with early versions. (knock on wood!) But the main advantage is that enabling the ORC unwinder allows CONFIG_FRAME_POINTERS to be turned off - which speeds up the kernel measurably: With frame pointers disabled, GCC does not have to add frame pointer instrumentation code to every function in the kernel. The kernel's .text size decreases by about 3.2%, resulting in better cache utilization and fewer instructions executed, resulting in a broad kernel-wide speedup. Average speedup of system calls should be roughly in the 1-3% range - measurements by Mel Gorman [1] have shown a speedup of 5-10% for some function execution intense workloads. The main cost of the unwinder is that the unwinder data has to be stored in RAM: the memory cost is 2-4MB of RAM, depending on kernel config - which is a modest cost on modern x86 systems. Given how young the ORC unwinder code is it's not enabled by default - but given the performance advantages the plan is to eventually make it the default unwinder on x86. See Documentation/x86/orc-unwinder.txt for more details. - Remove lguest support: its intended role was that of a temporary proof of concept for virtualization, plus its removal will enable the reduction (removal) of the paravirt API as well, so Rusty agreed to its removal. (Juergen Gross) - Clean up and fix FSGS related functionality (Andy Lutomirski) - Clean up IO access APIs (Andy Shevchenko) - Enhance the symbol namespace (Jiri Slaby) * 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (47 commits) objtool: Handle GCC stack pointer adjustment bug x86/entry/64: Use ENTRY() instead of ALIGN+GLOBAL for stub32_clone() x86/fpu/math-emu: Add ENDPROC to functions x86/boot/64: Extract efi_pe_entry() from startup_64() x86/boot/32: Extract efi_pe_entry() from startup_32() x86/lguest: Remove lguest support x86/paravirt/xen: Remove xen_patch() objtool: Fix objtool fallthrough detection with function padding x86/xen/64: Fix the reported SS and CS in SYSCALL objtool: Track DRAP separately from callee-saved registers objtool: Fix validate_branch() return codes x86: Clarify/fix no-op barriers for text_poke_bp() x86/switch_to/64: Rewrite FS/GS switching yet again to fix AMD CPUs selftests/x86/fsgsbase: Test selectors 1, 2, and 3 x86/fsgsbase/64: Report FSBASE and GSBASE correctly in core dumps x86/fsgsbase/64: Fully initialize FS and GS state in start_thread_common x86/asm: Fix UNWIND_HINT_REGS macro for older binutils x86/asm/32: Fix regs_get_register() on segment registers x86/xen/64: Rearrange the SYSCALL entries x86/asm/32: Remove a bunch of '& 0xffff' from pt_regs segment reads ...
Diffstat (limited to 'arch/x86/kernel/process_64.c')
-rw-r--r--arch/x86/kernel/process_64.c242
1 files changed, 135 insertions, 107 deletions
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..c85269a76511 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
- (void *)regs->ip);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
if (regs->orig_ax != -1)
@@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
}
}
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- unsigned prev_fsindex, prev_gsindex;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
switch_fpu_prepare(prev_fpu, cpu);
@@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
- savesegment(fs, prev_fsindex);
- savesegment(gs, prev_gsindex);
+ save_fsgs(prev_p);
/*
* Load TLS before restoring any segments so that segment loads
@@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
- /*
- * Switch FS and GS.
- *
- * These are even more complicated than DS and ES: they have
- * 64-bit bases are that controlled by arch_prctl. The bases
- * don't necessarily match the selectors, as user code can do
- * any number of things to cause them to be inconsistent.
- *
- * We don't promise to preserve the bases if the selectors are
- * nonzero. We also don't promise to preserve the base if the
- * selector is zero and the base doesn't match whatever was
- * most recently passed to ARCH_SET_FS/GS. (If/when the
- * FSGSBASE instructions are enabled, we'll need to offer
- * stronger guarantees.)
- *
- * As an invariant,
- * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
- * impossible.
- */
- if (next->fsindex) {
- /* Loading a nonzero value into FS sets the index and base. */
- loadsegment(fs, next->fsindex);
- } else {
- if (next->fsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_fsindex)
- loadsegment(fs, 0);
- wrmsrl(MSR_FS_BASE, next->fsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- */
- loadsegment(fs, __USER_DS);
- loadsegment(fs, 0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_FS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->fsbase || prev_fsindex)
- loadsegment(fs, 0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_fsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_fsindex)
- prev->fsbase = 0;
- prev->fsindex = prev_fsindex;
-
- if (next->gsindex) {
- /* Loading a nonzero value into GS sets the index and base. */
- load_gs_index(next->gsindex);
- } else {
- if (next->gsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_gsindex)
- load_gs_index(0);
- wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- *
- * This contains a pointless SWAPGS pair.
- * Fixing it would involve an explicit check
- * for Xen or a new pvop.
- */
- load_gs_index(__USER_DS);
- load_gs_index(0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_GS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->gsbase || prev_gsindex)
- load_gs_index(0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_gsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_gsindex)
- prev->gsbase = 0;
- prev->gsindex = prev_gsindex;
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
switch_fpu_finish(next_fpu, cpu);