summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/audit_64.c1
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_iommu.c5
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c6
-rw-r--r--arch/x86/kernel/e820.c4
-rw-r--r--arch/x86/kernel/early-quirks.c23
-rw-r--r--arch/x86/kernel/entry_32.S6
-rw-r--r--arch/x86/kernel/entry_64.S28
-rw-r--r--arch/x86/kernel/kvm.c9
-rw-r--r--arch/x86/kernel/kvmclock.c20
-rw-r--r--arch/x86/kernel/process_64.c101
-rw-r--r--arch/x86/kernel/tls.c45
-rw-r--r--arch/x86/kernel/xsave.c1
16 files changed, 204 insertions, 65 deletions
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index d67c4be3e8b1..3b3b9d33ac1d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,3 +1,7 @@
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
#include <asm/ucontext.h>
#include <linux/lguest.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4f9359f36bb7..fdcbb4d27c9f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,3 +1,7 @@
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
case __NR_openat:
return 3;
case __NR_execve:
+ case __NR_execveat:
return 5;
default:
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
index 639d1289b1ba..97242a9242bd 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
@@ -130,10 +130,7 @@ static ssize_t _iommu_cpumask_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask);
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 30790d798e6b..cc6cedb8f25d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -219,7 +219,6 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- int n;
cpumask_t *active_mask;
struct pmu *pmu = dev_get_drvdata(dev);
@@ -230,10 +229,7 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
else
return 0;
- n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, active_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index d64f275fe274..673f930c700f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -365,11 +365,7 @@ static void rapl_pmu_event_read(struct perf_event *event)
static ssize_t rapl_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
-
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 9762dbd9f3f7..08f3fed2b0f2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -647,11 +647,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
static ssize_t uncore_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
-
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
+ return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 49f886481615..dd2f07ae9d0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void)
* at first, and assume boot_mem will not take below MAX_DMA_PFN
*/
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+ start_pfn = min(start_pfn, MAX_DMA_PFN);
+ end_pfn = min(end_pfn, MAX_DMA_PFN);
nr_pages += end_pfn - start_pfn;
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 2e1a6853e00c..fe9f0b79a18b 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -455,6 +455,23 @@ struct intel_stolen_funcs {
u32 (*base)(int num, int slot, int func, size_t size);
};
+static size_t __init gen9_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= BDW_GMCH_GMS_SHIFT;
+ gmch_ctrl &= BDW_GMCH_GMS_MASK;
+
+ if (gmch_ctrl < 0xf0)
+ return gmch_ctrl << 25; /* 32 MB units */
+ else
+ /* 4MB increments starting at 0xf0 for 4MB */
+ return (gmch_ctrl - 0xf0 + 1) << 22;
+}
+
+typedef size_t (*stolen_size_fn)(int num, int slot, int func);
+
static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
.base = i830_stolen_base,
.size = i830_stolen_size,
@@ -490,6 +507,11 @@ static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
.size = gen8_stolen_size,
};
+static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen9_stolen_size,
+};
+
static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
.base = intel_stolen_base,
.size = chv_stolen_size,
@@ -523,6 +545,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_BDW_M_IDS(&gen8_stolen_funcs),
INTEL_BDW_D_IDS(&gen8_stolen_funcs),
INTEL_CHV_IDS(&chv_stolen_funcs),
+ INTEL_SKL_IDS(&gen9_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 344b63f18d14..1cf7c97ff175 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1191,10 +1191,10 @@ ENTRY(ftrace_graph_caller)
pushl %eax
pushl %ecx
pushl %edx
- movl 0xc(%esp), %edx
- lea 0x4(%ebp), %eax
+ movl 0xc(%esp), %eax
+ lea 0x4(%ebp), %edx
movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %edx
+ subl $MCOUNT_INSN_SIZE, %eax
call prepare_ftrace_return
popl %edx
popl %ecx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c0226ab54106..90878aa38dbd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,6 +652,20 @@ ENTRY(stub_execve)
CFI_ENDPROC
END(stub_execve)
+ENTRY(stub_execveat)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call sys_execveat
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_execveat)
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
@@ -697,6 +711,20 @@ ENTRY(stub_x32_execve)
CFI_ENDPROC
END(stub_x32_execve)
+ENTRY(stub_x32_execveat)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call compat_sys_execveat
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_execveat)
+
#endif
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f6945bef2cd1..94f643484300 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+
+ /*
+ * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
+ * guest kernel works like a bare metal kernel with additional
+ * features, and paravirt_enabled is about features that are
+ * missing.
+ */
+ pv_info.paravirt_enabled = 0;
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d9156ceecdff..42caaef897c8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -59,13 +59,12 @@ static void kvm_get_wallclock(struct timespec *now)
native_write_msr(msr_kvm_wall_clock, low, high);
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
pvclock_read_wallclock(&wall_clock, vcpu_time, now);
- preempt_enable();
+ put_cpu();
}
static int kvm_set_wallclock(const struct timespec *now)
@@ -107,11 +106,10 @@ static unsigned long kvm_get_tsc_khz(void)
int cpu;
unsigned long tsc_khz;
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
src = &hv_clock[cpu].pvti;
tsc_khz = pvclock_tsc_khz(src);
- preempt_enable();
+ put_cpu();
return tsc_khz;
}
@@ -263,7 +261,6 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
- pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
@@ -284,23 +281,22 @@ int __init kvm_setup_vsyscall_timeinfo(void)
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
- preempt_enable();
+ put_cpu();
return 1;
}
if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
- preempt_enable();
+ put_cpu();
return ret;
}
- preempt_enable();
+ put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3ed4a68d4013..5a2c02913af3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
- /*
- * Reload esp0, LDT and the page table pointer:
- */
+ /* Reload esp0 and ss1. */
load_sp0(tss, next);
- /*
- * Switch DS and ES.
- * This won't pick up thread selector changes, but I guess that is ok.
- */
- savesegment(es, prev->es);
- if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
-
- savesegment(ds, prev->ds);
- if (unlikely(next->ds | prev->ds))
- loadsegment(ds, next->ds);
-
-
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
*
@@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
savesegment(fs, fsindex);
savesegment(gs, gsindex);
+ /*
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
+ */
load_TLS(next, cpu);
/*
- * Leave lazy mode, flushing any hypercalls made here.
- * This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them, and and it must
+ * be done before math_state_restore, so the TS bit is up to
+ * date.
*/
arch_end_context_switch(next_p);
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
+ */
+ savesegment(es, prev->es);
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+ savesegment(ds, prev->ds);
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
/*
* Switch FS and GS.
*
- * Segment register != 0 always requires a reload. Also
- * reload when it has changed. When prev process used 64bit
- * base always reload to avoid an information leak.
+ * These are even more complicated than FS and GS: they have
+ * 64-bit bases are that controlled by arch_prctl. Those bases
+ * only differ from the values in the GDT or LDT if the selector
+ * is 0.
+ *
+ * Loading the segment register resets the hidden base part of
+ * the register to 0 or the value from the GDT / LDT. If the
+ * next base address zero, writing 0 to the segment register is
+ * much faster than using wrmsr to explicitly zero the base.
+ *
+ * The thread_struct.fs and thread_struct.gs values are 0
+ * if the fs and gs bases respectively are not overridden
+ * from the values implied by fsindex and gsindex. They
+ * are nonzero, and store the nonzero base addresses, if
+ * the bases are overridden.
+ *
+ * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+ * be impossible.
+ *
+ * Therefore we need to reload the segment registers if either
+ * the old or new selector is nonzero, and we need to override
+ * the base address if next thread expects it to be overridden.
+ *
+ * This code is unnecessarily slow in the case where the old and
+ * new indexes are zero and the new base is nonzero -- it will
+ * unnecessarily write 0 to the selector before writing the new
+ * base address.
+ *
+ * Note: This all depends on arch_prctl being the only way that
+ * user code can override the segment base. Once wrfsbase and
+ * wrgsbase are enabled, most of this code will need to change.
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
+
/*
- * Check if the user used a selector != 0; if yes
- * clear 64bit base, since overloaded base is always
- * mapped to the Null selector
+ * If user code wrote a nonzero value to FS, then it also
+ * cleared the overridden base address.
+ *
+ * XXX: if user code wrote 0 to FS and cleared the base
+ * address itself, we won't notice and we'll incorrectly
+ * restore the prior base address next time we reschdule
+ * the process.
*/
if (fsindex)
prev->fs = 0;
}
- /* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
+
+ /* This works (and fails) the same way as fsindex above. */
if (gsindex)
prev->gs = 0;
}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index f7fec09e3e3a..3e551eee87b9 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -27,6 +27,43 @@ static int get_free_idx(void)
return -ESRCH;
}
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ if (LDT_empty(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+#ifdef CONFIG_X86_64
+ /* The L bit makes no sense for data. */
+ if (info->lm)
+ return false;
+#endif
+
+ return true;
+}
+
static void set_tls_desc(struct task_struct *p, int idx,
const struct user_desc *info, int n)
{
@@ -66,6 +103,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
if (copy_from_user(&info, u_info, sizeof(info)))
return -EFAULT;
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
if (idx == -1)
idx = info.entry_number;
@@ -192,6 +232,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
{
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
+ int i;
if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
@@ -205,6 +246,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
else
info = infobuf;
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
set_tls_desc(target,
GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
info, count / sizeof(struct user_desc));
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 4c540c4719d8..0de1fae2bdf0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -738,3 +738,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
return (void *)xsave + xstate_comp_offsets[feature];
}
+EXPORT_SYMBOL_GPL(get_xsave_addr);