diff options
Diffstat (limited to 'arch/x86/kernel')
39 files changed, 797 insertions, 268 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c8b983..cdb1b70ddad0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o entry_$(BITS).o @@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a18fff361c7f..3d525c6124f6 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { int rc, irq, trigger, polarity; + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + rc = acpi_get_override_irq(gsi, &trigger, &polarity); if (rc == 0) { trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; @@ -845,13 +850,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) static int __init acpi_parse_sbf(struct acpi_table_header *table) { - struct acpi_table_boot *sb; - - sb = (struct acpi_table_boot *)table; - if (!sb) { - printk(KERN_WARNING PREFIX "Unable to map SBF\n"); - return -ENODEV; - } + struct acpi_table_boot *sb = (struct acpi_table_boot *)table; sbf_port = sb->cmos_index; /* Save CMOS port */ @@ -865,13 +864,7 @@ static struct resource *hpet_res __initdata; static int __init acpi_parse_hpet(struct acpi_table_header *table) { - struct acpi_table_hpet *hpet_tbl; - - hpet_tbl = (struct acpi_table_hpet *)table; - if (!hpet_tbl) { - printk(KERN_WARNING PREFIX "Unable to map HPET\n"); - return -ENODEV; - } + struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { printk(KERN_WARNING PREFIX "HPET timers must be located in " diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 31368207837c..d1daead5fcdd 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void) header->pmode_cr0 = read_cr0(); if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = __read_cr4(); header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b665d241efad..ad3639ae1b9b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1580,8 +1580,7 @@ static __init void try_to_enable_x2apic(int remap_mode) * under KVM */ if (max_physical_apicid > 255 || - (IS_ENABLED(CONFIG_HYPERVISOR_GUEST) && - !hypervisor_x2apic_available())) { + !hypervisor_x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b15bffcaba6d..b5c8ff5e9dfc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/debugreg.h> #include <asm/sections.h> #include <asm/vsyscall.h> @@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep); static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) - set_in_cr4(X86_CR4_SMEP); + cr4_set_bits(X86_CR4_SMEP); } static __init int setup_disable_smap(char *arg) @@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_SMAP)) { #ifdef CONFIG_X86_SMAP - set_in_cr4(X86_CR4_SMAP); + cr4_set_bits(X86_CR4_SMAP); #else - clear_in_cr4(X86_CR4_SMAP); + cr4_clear_bits(X86_CR4_SMAP); #endif } } @@ -1295,6 +1296,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + + /* * Load microcode on this cpu if a valid microcode is available. * This is early microcode loading procedure. */ @@ -1313,7 +1320,7 @@ void cpu_init(void) pr_debug("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, @@ -1394,7 +1401,7 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); switch_to_new_gdt(cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c7035073dfc1..659643376dbf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, int type, char *buf) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; - int n = 0; - - if (len > 1) { - const struct cpumask *mask; - - mask = to_cpumask(this_leaf->shared_cpu_map); - n = type ? - cpulist_scnprintf(buf, len-2, mask) : - cpumask_scnprintf(buf, len-2, mask); - buf[n++] = '\n'; - buf[n] = '\0'; - } - return n; + const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); + int ret; + + if (type) + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", + cpumask_pr_args(mask)); + else + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", + cpumask_pr_args(mask)); + buf[ret++] = '\n'; + buf[ret] = '\0'; + return ret; } static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cdfed7953963..3c036cb4a370 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,6 +44,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -151,14 +152,11 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; - int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); - if (ret == NOTIFY_STOP) - return; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); mce->finished = 0; wmb(); @@ -1452,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void) bitmap_fill(all_banks, MAX_NR_BANKS); machine_check_poll(MCP_UC | m_fl, &all_banks); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index ec2663a708e4..737b0ad4e61a 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -65,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) "Intel old style machine check architecture supported.\n"); /* Enable MCE: */ - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index bd5d46a32210..44f138296fbe 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -36,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c6826d1e8082..746e7fd08aad 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, struct microcode_header_intel mc_header; unsigned int mc_size; + if (leftover < sizeof(mc_header)) { + pr_err("error! Truncated header in microcode data file\n"); + break; + } + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) break; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index ec9df6f9cd47..420eb933189c 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -321,7 +321,11 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover) { + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9e451b0876b5..f8c81ba0b465 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -138,8 +138,8 @@ static void prepare_set(void) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* @@ -171,7 +171,7 @@ static void post_set(void) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); } static void cyrix_set_arr(unsigned int reg, unsigned long base, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0e25a1bc5ab5..7d74f7b3c6ba 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ @@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 143e5f5dc855..e0dab5ce61e9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,8 @@ #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> @@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -395,39 +399,41 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; - - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; - - /* branch_sample_type is compatible */ - - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at the priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; - - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } + } + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } + if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + event->attach_state |= PERF_ATTACH_TASK_DATA; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -1327,8 +1333,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - if (x86_pmu.attr_rdpmc) - set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1804,14 +1808,44 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } + if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + return err; } +static void refresh_pce(void *ignored) +{ + if (current->mm) + load_mm_cr4(current->mm); +} + +static void x86_pmu_event_mapped(struct perf_event *event) +{ + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static void x86_pmu_event_unmapped(struct perf_event *event) +{ + if (!current->mm) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; - if (!x86_pmu.attr_rdpmc) + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { @@ -1829,16 +1863,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev, return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } -static void change_rdpmc(void *info) -{ - bool enable = !!(unsigned long)info; - - if (enable) - set_in_cr4(X86_CR4_PCE); - else - clear_in_cr4(X86_CR4_PCE); -} - static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) @@ -1850,14 +1874,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (ret) return ret; + if (val > 2) + return -EINVAL; + if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; - if (!!val != !!x86_pmu.attr_rdpmc) { - x86_pmu.attr_rdpmc = !!val; - on_each_cpu(change_rdpmc, (void *)val, 1); + if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { + /* + * Changing into or out of always available, aka + * perf-event-bypassing mode. This path is extremely slow, + * but only root can trigger it, so it's okay. + */ + if (val == 2) + static_key_slow_inc(&rdpmc_always_available); + else + static_key_slow_dec(&rdpmc_always_available); + on_each_cpu(refresh_pce, NULL, 1); } + x86_pmu.attr_rdpmc = val; + return count; } @@ -1879,10 +1916,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; -static void x86_pmu_flush_branch_stack(void) +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (x86_pmu.flush_branch_stack) - x86_pmu.flush_branch_stack(); + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); } void perf_check_microcode(void) @@ -1900,6 +1937,9 @@ static struct pmu pmu = { .event_init = x86_pmu_event_init, + .event_mapped = x86_pmu_event_mapped, + .event_unmapped = x86_pmu_event_unmapped, + .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, @@ -1911,16 +1951,19 @@ static struct pmu pmu = { .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, - .flush_branch_stack = x86_pmu_flush_branch_stack, + .sched_task = x86_pmu_sched_task, + .task_ctx_size = sizeof(struct x86_perf_task_context), }; -void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_rdpmc = + !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; if (!sched_clock_stable()) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4e6cdb0ddc70..a371d27d6795 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -71,6 +71,8 @@ struct event_constraint { #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ + struct amd_nb { int nb_id; /* NorthBridge id */ @@ -470,7 +472,8 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*flush_branch_stack)(void); + void (*sched_task)(struct perf_event_context *ctx, + bool sched_in); /* * Intel Arch Perfmon v2+ @@ -513,6 +516,13 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +struct x86_perf_task_context { + u64 lbr_from[MAX_LBR_ENTRIES]; + u64 lbr_to[MAX_LBR_ENTRIES]; + int lbr_callstack_users; + int lbr_stack_state; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ @@ -544,6 +554,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \ extern struct x86_pmu x86_pmu __read_mostly; +static inline bool x86_pmu_has_lbr_callstack(void) +{ + return x86_pmu.lbr_sel_map && + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; +} + DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); int x86_perf_event_set_period(struct perf_event *event); @@ -725,6 +741,8 @@ void intel_pmu_pebs_disable_all(void); void intel_ds_init(void); +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_pmu_lbr_reset(void); void intel_pmu_lbr_enable(struct perf_event *event); @@ -745,6 +763,8 @@ void intel_pmu_lbr_init_atom(void); void intel_pmu_lbr_init_snb(void); +void intel_pmu_lbr_init_hsw(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); int p4_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index a61f5c6911da..989d3c215d2b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off) * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that * is using the new offset. */ -static int force_ibs_eilvt_setup(void) +static void force_ibs_eilvt_setup(void) { int offset; int ret; @@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void) if (offset == APIC_EILVT_NR_MAX) { printk(KERN_DEBUG "No EILVT entry available\n"); - return -EBUSY; + return; } ret = setup_ibs_ctl(offset); if (ret) goto out; - if (!ibs_eilvt_valid()) { - ret = -EFAULT; + if (!ibs_eilvt_valid()) goto out; - } pr_info("IBS: LVT offset %d assigned\n", offset); - return 0; + return; out: preempt_disable(); put_eilvt(offset); preempt_enable(); - return ret; + return; } static void ibs_eilvt_setup(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 498b6d967138..9f1dd18fa395 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1029,20 +1029,6 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1207,7 +1193,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * must disable before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_disable(event); if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1268,7 +1254,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * must enabled before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_enable(event); if (event->attr.exclude_host) @@ -1747,7 +1733,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - if (intel_pmu_needs_lbr_smpl(event)) { + if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; @@ -2044,18 +2030,6 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } -static void intel_pmu_flush_branch_stack(void) -{ - /* - * Intel LBR does not tag entries with the - * PID of the current task, then we need to - * flush it on ctxsw - * For now, we simply reset it - */ - if (x86_pmu.lbr_nr) - intel_pmu_lbr_reset(); -} - PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -2107,7 +2081,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .flush_branch_stack = intel_pmu_flush_branch_stack, + .sched_task = intel_pmu_lbr_sched_task, }; static __init void intel_clovertown_quirk(void) @@ -2549,7 +2523,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_snb(); + intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_hsw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 58f1a94beaf0..0473874109cb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -39,6 +39,7 @@ static enum { #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ #define LBR_FAR_BIT 8 /* do not capture far branches */ +#define LBR_CALL_STACK_BIT 9 /* enable call stack */ #define LBR_KERNEL (1 << LBR_KERNEL_BIT) #define LBR_USER (1 << LBR_USER_BIT) @@ -49,6 +50,7 @@ static enum { #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) #define LBR_FAR (1 << LBR_FAR_BIT) +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) #define LBR_PLM (LBR_KERNEL | LBR_USER) @@ -69,33 +71,31 @@ static enum { #define LBR_FROM_FLAG_IN_TX (1ULL << 62) #define LBR_FROM_FLAG_ABORT (1ULL << 61) -#define for_each_branch_sample_type(x) \ - for ((x) = PERF_SAMPLE_BRANCH_USER; \ - (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) - /* * x86control flow change classification * x86control flow changes include branches, interrupts, traps, faults */ enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -112,13 +112,15 @@ enum { X86_BR_JMP |\ X86_BR_IRQ |\ X86_BR_ABORT |\ - X86_BR_IND_CALL) + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) #define X86_BR_ANY_CALL \ (X86_BR_CALL |\ X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ X86_BR_SYSCALL |\ X86_BR_IRQ |\ X86_BR_INT) @@ -132,14 +134,23 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); static void __intel_pmu_lbr_enable(void) { - u64 debugctl; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 debugctl, lbr_select = 0; - if (cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); + if (cpuc->lbr_sel) { + lbr_select = cpuc->lbr_sel->config; + wrmsrl(MSR_LBR_SELECT, lbr_select); + } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + debugctl |= DEBUGCTLMSR_LBR; + /* + * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. + * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions + * may cause superfluous increase/decrease of LBR_TOS. + */ + if (!(lbr_select & LBR_CALL_STACK)) + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } @@ -181,9 +192,116 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + return tos; +} + +enum { + LBR_NONE, + LBR_VALID, +}; + +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + } + task_ctx->lbr_stack_state = LBR_NONE; +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + } + task_ctx->lbr_stack_state = LBR_VALID; +} + +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + if (!x86_pmu.lbr_nr) + return; + + /* + * If LBR callstack feature is enabled and the stack was saved when + * the task was scheduled out, restore the stack. Otherwise flush + * the LBR stack. + */ + task_ctx = ctx ? ctx->task_ctx_data : NULL; + if (task_ctx) { + if (sched_in) { + __intel_pmu_lbr_restore(task_ctx); + cpuc->lbr_context = ctx; + } else { + __intel_pmu_lbr_save(task_ctx); + } + return; + } + + /* + * When sampling the branck stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branch from multiple tasks. + */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + +static inline bool branch_user_callstack(unsigned br_sel) +{ + return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); +} + void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -198,18 +316,33 @@ void intel_pmu_lbr_enable(struct perf_event *event) } cpuc->br_sel = event->hw.branch_reg.reg; + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users++; + } + cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); } void intel_pmu_lbr_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users--; + } + cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); @@ -234,18 +367,6 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -/* - * TOS = most recently recorded branch - */ -static inline u64 intel_pmu_lbr_tos(void) -{ - u64 tos; - - rdmsrl(x86_pmu.lbr_tos, tos); - - return tos; -} - static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; @@ -350,7 +471,7 @@ void intel_pmu_lbr_read(void) * - in case there is no HW filter * - in case the HW filter has errata or limitations */ -static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) { u64 br_type = event->attr.branch_sample_type; int mask = 0; @@ -387,11 +508,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_COND) mask |= X86_BR_JCC; + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { + if (!x86_pmu_has_lbr_callstack()) + return -EOPNOTSUPP; + if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) + return -EINVAL; + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | + X86_BR_CALL_STACK; + } + /* * stash actual user request into reg, it may * be used by fixup code for some CPU */ event->hw.branch_reg.reg = mask; + return 0; } /* @@ -403,14 +534,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) { struct hw_perf_event_extra *reg; u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, m; - u64 v; + u64 mask = 0, v; + int i; - for_each_branch_sample_type(m) { - if (!(br_type & m)) + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & (1ULL << i))) continue; - v = x86_pmu.lbr_sel_map[m]; + v = x86_pmu.lbr_sel_map[i]; if (v == LBR_NOT_SUPP) return -EOPNOTSUPP; @@ -420,8 +551,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; - /* LBR_SELECT operates in suppress mode so invert mask */ - reg->config = ~mask & x86_pmu.lbr_sel_mask; + /* + * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate + * in suppress mode. So LBR_SELECT should be set to + * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) + */ + reg->config = mask ^ x86_pmu.lbr_sel_mask; return 0; } @@ -439,7 +574,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) /* * setup SW LBR filter */ - intel_pmu_setup_sw_lbr_filter(event); + ret = intel_pmu_setup_sw_lbr_filter(event); + if (ret) + return ret; /* * setup HW LBR filter, if any @@ -568,6 +705,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_INT; break; case 0xe8: /* call near rel */ + insn_get_immediate(&insn); + if (insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } case 0x9a: /* call far absolute */ ret = X86_BR_CALL; break; @@ -678,35 +821,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) /* * Map interface branch filters onto LBR filters */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches */ - [PERF_SAMPLE_BRANCH_ANY_CALL] = + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, /* * NHM/WSM erratum: must include IND_JMP to capture IND_CALL */ - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, }; -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, +}; + +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_RETURN | LBR_CALL_STACK, }; /* core */ @@ -765,6 +922,20 @@ void __init intel_pmu_lbr_init_snb(void) pr_cont("16-deep LBR, "); } +/* haswell */ +void intel_pmu_lbr_init_hsw(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + /* atom */ void __init intel_pmu_lbr_init_atom(void) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 21af6149edf2..12d9548457e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid) } } - if (ubox_dev) - pci_dev_put(ubox_dev); + pci_dev_put(ubox_dev); return err ? pcibios_err_to_errno(err) : 0; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b74ebc7c4402..cf3df1d8d039 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err) printk("SMP "); #endif #ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); + printk("DEBUG_PAGEALLOC "); +#endif +#ifdef CONFIG_KASAN + printk("KASAN"); #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 01d1c187c9f9..a62536a1be88 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -19,6 +19,7 @@ #include <linux/usb/ehci_def.h> #include <linux/efi.h> #include <asm/efi.h> +#include <asm/pci_x86.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -76,7 +77,7 @@ static struct console early_vga_console = { /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ -static int early_serial_base = 0x3f8; /* ttyS0 */ +static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define XMTRDY 0x20 @@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + +static unsigned int io_serial_in(unsigned long addr, int offset) +{ + return inb(addr + offset); +} + +static void io_serial_out(unsigned long addr, int offset, int value) +{ + outb(value, addr + offset); +} + +static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; +static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; + static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - outb(ch, early_serial_base + TXR); + serial_out(early_serial_base, TXR, ch); return timeout ? 0 : -1; } @@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n) } } +static __init void early_serial_hw_init(unsigned divisor) +{ + unsigned char c; + + serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ + serial_out(early_serial_base, IER, 0); /* no interrupt */ + serial_out(early_serial_base, FCR, 0); /* no fifo */ + serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + + c = serial_in(early_serial_base, LCR); + serial_out(early_serial_base, LCR, c | DLAB); + serial_out(early_serial_base, DLL, divisor & 0xff); + serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); + serial_out(early_serial_base, LCR, c & ~DLAB); +} + #define DEFAULT_BAUD 9600 static __init void early_serial_init(char *s) { - unsigned char c; unsigned divisor; - unsigned baud = DEFAULT_BAUD; + unsigned long baud = DEFAULT_BAUD; char *e; if (*s == ',') @@ -145,24 +188,124 @@ static __init void early_serial_init(char *s) s++; } - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + if (*s) { + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* These will always be IO based ports */ + serial_in = io_serial_in; + serial_out = io_serial_out; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} + +#ifdef CONFIG_PCI +/* + * early_pci_serial_init() + * + * This function is invoked when the early_printk param starts with "pciserial" + * The rest of the param should be ",B:D.F,baud" where B, D & F describe the + * location of a PCI device that must be a UART device. + */ +static __init void early_pci_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + u8 bus, slot, func; + uint32_t classcode, bar0; + uint16_t cmdreg; + char *e; + + + /* + * First, part the param to get the BDF values + */ + if (*s == ',') + ++s; + + if (*s == 0) + return; + + bus = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != ':') + return; + ++s; + slot = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != '.') + return; + ++s; + func = (u8)simple_strtoul(s, &e, 16); + s = e; + + /* A baud might be following */ + if (*s == ',') + s++; + + /* + * Second, find the device from the BDF + */ + cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); + classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + + /* + * Verify it is a UART type device + */ + if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && + (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || + (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ + return; + + /* + * Determine if it is IO or memory mapped + */ + if (bar0 & 0x01) { + /* it is IO mapped */ + serial_in = io_serial_in; + serial_out = io_serial_out; + early_serial_base = bar0&0xfffffffc; + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_IO); + } else { + /* It is memory mapped - assume 32-bit alignment */ + serial_in = mem32_serial_in; + serial_out = mem32_serial_out; + /* WARNING! assuming the address is always in the first 4G */ + early_serial_base = + (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_MEMORY); + } + /* + * Lastly, initalize the hardware + */ if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) + if (strcmp(s, "nocfg") == 0) + /* Sometimes, we want to leave the UART alone + * and assume the BIOS has set it up correctly. + * "nocfg" tells us this is the case, and we + * should do no more setup. + */ + return; + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) baud = DEFAULT_BAUD; } + /* Convert from baud to divisor value */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + + /* Set up the HW */ + early_serial_hw_init(divisor); } +#endif static struct console early_serial_console = { .name = "earlyser", @@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf) early_serial_init(buf + 4); early_console_register(&early_serial_console, keep); } +#ifdef CONFIG_PCI + if (!strncmp(buf, "pciserial", 9)) { + early_pci_serial_init(buf + 9); + early_console_register(&early_serial_console, keep); + buf += 9; /* Keep from match the above "serial" */ + } +#endif if (!strncmp(buf, "vga", 3) && boot_params.screen_info.orig_video_isVGA == 1) { max_xpos = boot_params.screen_info.orig_video_cols; @@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf) early_console_register(&xenboot_console, keep); #endif #ifdef CONFIG_EARLY_PRINTK_INTEL_MID - if (!strncmp(buf, "mrst", 4)) { - mrst_early_console_init(); - early_console_register(&early_mrst_console, keep); - } - if (!strncmp(buf, "hsu", 3)) { hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6c1b9836995..2911ef3a9f1c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { + cr4_init_shadow(); sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index eda1a865641e..c4f8d4659070 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,6 +27,7 @@ #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> +#include <asm/kasan.h> /* * Manage page tables very early on. @@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void) next_early_pgt = 0; - write_cr3(__pa(early_level4_pgt)); + write_cr3(__pa_nodebug(early_level4_pgt)); } /* Create a new PMD entry */ @@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address) pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) return -1; again: @@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + cr4_init_shadow(); + /* Kill off the identity-map trampoline */ reset_early_page_tables(); + kasan_map_early_shadow(early_level4_pgt); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); @@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; + kasan_map_early_shadow(init_level4_pgt); + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a468c0a65c42..6fd514d9f69a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -514,8 +514,38 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#ifdef CONFIG_KASAN +#define FILL(VAL, COUNT) \ + .rept (COUNT) ; \ + .quad (VAL) ; \ + .endr + +NEXT_PAGE(kasan_zero_pte) + FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pmd) + FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pud) + FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) + +#undef FILL +#endif + + #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE + +#ifdef CONFIG_KASAN +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). + */ +NEXT_PAGE(kasan_zero_page) + .skip PAGE_SIZE +#endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 319bcb9372fe..3acbff4716b0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line) #define hpet_print_config() \ do { \ if (hpet_verbose) \ - _hpet_print_config(__FUNCTION__, __LINE__); \ + _hpet_print_config(__func__, __LINE__); \ } while (0) /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 81049ffab2d6..d5651fce0b71 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -13,6 +13,7 @@ #include <asm/sigcontext.h> #include <asm/processor.h> #include <asm/math_emu.h> +#include <asm/tlbflush.h> #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/i387.h> @@ -193,7 +194,7 @@ void fpu_init(void) if (cpu_has_xmm) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) - set_in_cr4(cr4_mask); + cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 705ef8d48e2d..67b1cbe0093a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void) irq = __this_cpu_read(vector_irq[vector]); if (irq >= 0) { desc = irq_to_desc(irq); + if (!desc) + continue; + data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpu_clear(this_cpu, affinity_new); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3aef248ec1ee..7b3b9d15c47a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -324,7 +324,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. */ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 94f643484300..e354cc6446ab 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -609,7 +609,7 @@ static inline void check_zero(void) u8 ret; u8 old; - old = ACCESS_ONCE(zero_stats); + old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) int cpu; u64 start; unsigned long flags; + __ticket_t head; if (in_nmi()) return; @@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking. */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) add_stats(RELEASED_SLOW, 1); for_each_cpu(cpu, &waiting_cpus) { const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == ticket) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == ticket) { add_stats(RELEASED_SLOW_KICKED, 1); kvm_kick_cpu(cpu); break; diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..ff3c3101d003 --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/page_types.h> +#include <asm/elf.h> +#include <asm/livepatch.h> + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e69f9882bf95..9bbb9b35c144 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/kasan.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/gfp.h> @@ -46,21 +47,13 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; -static int randomize_modules = 1; /* Mutex protects the module_load_offset. */ static DEFINE_MUTEX(module_kaslr_mutex); -static int __init parse_nokaslr(char *p) -{ - randomize_modules = 0; - return 0; -} -early_param("nokaslr", parse_nokaslr); - static unsigned long int get_module_load_offset(void) { - if (randomize_modules) { + if (kaslr_enabled) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this @@ -83,13 +76,22 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + void *p; + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - return __vmalloc_node_range(size, 1, + + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC, NUMA_NO_NODE, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { + vfree(p); + return NULL; + } + + return p; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127ddaa2d5a..046e2d620bbe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ #include <asm/fpu-internal.h> #include <asm/debugreg.h> #include <asm/nmi.h> +#include <asm/tlbflush.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -141,7 +142,7 @@ void flush_thread(void) static void hard_disable_TSC(void) { - write_cr4(read_cr4() | X86_CR4_TSD); + cr4_set_bits(X86_CR4_TSD); } void disable_TSC(void) @@ -158,7 +159,7 @@ void disable_TSC(void) static void hard_enable_TSC(void) { - write_cr4(read_cr4() & ~X86_CR4_TSD); + cr4_clear_bits(X86_CR4_TSD); } static void enable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8f3ebfe710d0..603c4f99cb5a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4_safe(); + cr4 = __read_cr4_safe(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a2c02913af3..67fcc43577d2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index fe3dbfe0c4a5..cd9685235df9 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now) retval = set_rtc_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", - __FUNCTION__, retval); + __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %lx to RTC failed\n", - __FUNCTION__, nowtime); + __func__, nowtime); retval = -EINVAL; } return retval; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4648adadd7d..98dc9317286e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -89,6 +89,7 @@ #include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/bugs.h> +#include <asm/kasan.h> #include <asm/vsyscall.h> #include <asm/cpu.h> @@ -121,6 +122,8 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +bool __read_mostly kaslr_enabled = false; + #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); #endif @@ -424,6 +427,11 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init parse_kaslr_setup(u64 pa_data, u32 data_len) +{ + kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data)); +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -449,6 +457,9 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_KASLR: + parse_kaslr_setup(pa_data, data_len); + break; default: break; } @@ -831,10 +842,14 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - pr_emerg("Kernel Offset: 0x%lx from 0x%lx " - "(relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, __START_KERNEL, - __START_KERNEL_map, MODULES_VADDR-1); + if (kaslr_enabled) + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + else + pr_emerg("Kernel Offset: disabled\n"); return 0; } @@ -1174,9 +1189,11 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_init(); + kasan_init(); + if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = read_cr4(); + mmu_cr4_features = __read_cr4(); if (trampoline_cr4_features) *trampoline_cr4_features = mmu_cr4_features; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2a33c8f68319..e5042463c1bc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 040681928e9d..37d8fa4438f0 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial); #undef memset #undef memmove +extern void *__memset(void *, int, __kernel_size_t); +extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *__memmove(void *, const void *, __kernel_size_t); extern void *memset(void *, int, __kernel_size_t); extern void *memcpy(void *, const void *, __kernel_size_t); -extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *memmove(void *, const void *, __kernel_size_t); + +EXPORT_SYMBOL(__memset); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(__memmove); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); #ifndef CONFIG_DEBUG_VIRTUAL diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 0de1fae2bdf0..34f66e58a896 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -12,6 +12,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/sigframe.h> +#include <asm/tlbflush.h> #include <asm/xcr.h> /* @@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { - set_in_cr4(X86_CR4_OSXSAVE); + cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } |