summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c15
-rw-r--r--arch/x86/kernel/cpu/perf_event.c174
-rw-r--r--arch/x86/kernel/cpu/perf_event.h47
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c269
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c108
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c321
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c13
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c74
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c21
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c6
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S33
-rw-r--r--arch/x86/kernel/head_64.S20
-rw-r--r--arch/x86/kernel/kvm.c43
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c24
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c22
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c22
20 files changed, 815 insertions, 448 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 20190bdac9d5..95cf78d44ab4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -53,9 +53,12 @@
static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define rcu_dereference_check_mce(p) \
- rcu_dereference_index_check((p), \
- rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_chrdev_read_mutex))
+({ \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious rcu_dereference_check_mce() usage"); \
+ smp_load_acquire(&(p)); \
+})
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -1887,7 +1890,7 @@ out:
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
- if (rcu_access_index(mcelog.next))
+ if (READ_ONCE(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return POLLIN | POLLRDNORM;
@@ -1932,8 +1935,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
-ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
{
if (mce_write)
return mce_write(filp, ubuf, usize, off);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 87848ebe2bb7..5801a14f7524 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -135,6 +135,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
}
static atomic_t active_events;
+static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);
#ifdef CONFIG_X86_LOCAL_APIC
@@ -190,6 +191,7 @@ static bool check_hw_exists(void)
u64 val, val_fail, val_new= ~0;
int i, reg, reg_fail, ret = 0;
int bios_fail = 0;
+ int reg_safe = -1;
/*
* Check to see if the BIOS enabled any of the counters, if so
@@ -204,6 +206,8 @@ static bool check_hw_exists(void)
bios_fail = 1;
val_fail = val;
reg_fail = reg;
+ } else {
+ reg_safe = i;
}
}
@@ -222,11 +226,22 @@ static bool check_hw_exists(void)
}
/*
+ * If all the counters are enabled, the below test will always
+ * fail. The tools will also become useless in this scenario.
+ * Just fail and disable the hardware counters.
+ */
+
+ if (reg_safe == -1) {
+ reg = reg_safe;
+ goto msr_fail;
+ }
+
+ /*
* Read the current value, change it and read it back to see if it
* matches, this is needed to detect certain hardware emulators
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
*/
- reg = x86_pmu_event_addr(0);
+ reg = x86_pmu_event_addr(reg_safe);
if (rdmsrl_safe(reg, &val))
goto msr_fail;
val ^= 0xffffUL;
@@ -256,11 +271,8 @@ msr_fail:
static void hw_perf_event_destroy(struct perf_event *event)
{
- if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
- release_pmc_hardware();
- release_ds_buffers();
- mutex_unlock(&pmc_reserve_mutex);
- }
+ x86_release_hardware();
+ atomic_dec(&active_events);
}
void hw_perf_lbr_event_destroy(struct perf_event *event)
@@ -310,6 +322,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return x86_pmu_extra_regs(val, event);
}
+int x86_reserve_hardware(void)
+{
+ int err = 0;
+
+ if (!atomic_inc_not_zero(&pmc_refcount)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&pmc_refcount) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ reserve_ds_buffers();
+ }
+ if (!err)
+ atomic_inc(&pmc_refcount);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+
+ return err;
+}
+
+void x86_release_hardware(void)
+{
+ if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ release_ds_buffers();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
/*
* Check if we can create event of a certain type (that no conflicting events
* are present).
@@ -322,21 +363,34 @@ int x86_add_exclusive(unsigned int what)
return 0;
mutex_lock(&pmc_reserve_mutex);
- for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
goto out;
+ }
atomic_inc(&x86_pmu.lbr_exclusive[what]);
ret = 0;
out:
mutex_unlock(&pmc_reserve_mutex);
+
+ /*
+ * Assuming that all exclusive events will share the PMI handler
+ * (which checks active_events for whether there is work to do),
+ * we can bump active_events counter right here, except for
+ * x86_lbr_exclusive_lbr events that go through x86_pmu_event_init()
+ * path, which already bumps active_events for them.
+ */
+ if (!ret && what != x86_lbr_exclusive_lbr)
+ atomic_inc(&active_events);
+
return ret;
}
void x86_del_exclusive(unsigned int what)
{
atomic_dec(&x86_pmu.lbr_exclusive[what]);
+ atomic_dec(&active_events);
}
int x86_setup_perfctr(struct perf_event *event)
@@ -513,22 +567,11 @@ static int __x86_pmu_event_init(struct perf_event *event)
if (!x86_pmu_initialized())
return -ENODEV;
- err = 0;
- if (!atomic_inc_not_zero(&active_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_events) == 0) {
- if (!reserve_pmc_hardware())
- err = -EBUSY;
- else
- reserve_ds_buffers();
- }
- if (!err)
- atomic_inc(&active_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
+ err = x86_reserve_hardware();
if (err)
return err;
+ atomic_inc(&active_events);
event->destroy = hw_perf_event_destroy;
event->hw.idx = -1;
@@ -611,6 +654,7 @@ struct sched_state {
int event; /* event index */
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
+ int nr_gp; /* number of GP counters used */
unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};
@@ -620,27 +664,29 @@ struct sched_state {
struct perf_sched {
int max_weight;
int max_events;
- struct perf_event **events;
- struct sched_state state;
+ int max_gp;
int saved_states;
+ struct event_constraint **constraints;
+ struct sched_state state;
struct sched_state saved[SCHED_STATES_MAX];
};
/*
* Initialize interator that runs through all events and counters.
*/
-static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
- int num, int wmin, int wmax)
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+ int num, int wmin, int wmax, int gpmax)
{
int idx;
memset(sched, 0, sizeof(*sched));
sched->max_events = num;
sched->max_weight = wmax;
- sched->events = events;
+ sched->max_gp = gpmax;
+ sched->constraints = constraints;
for (idx = 0; idx < num; idx++) {
- if (events[idx]->hw.constraint->weight == wmin)
+ if (constraints[idx]->weight == wmin)
break;
}
@@ -687,7 +733,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
if (sched->state.event >= sched->max_events)
return false;
- c = sched->events[sched->state.event]->hw.constraint;
+ c = sched->constraints[sched->state.event];
/* Prefer fixed purpose counters */
if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
idx = INTEL_PMC_IDX_FIXED;
@@ -696,11 +742,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
goto done;
}
}
+
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
- if (!__test_and_set_bit(idx, sched->state.used))
+ if (!__test_and_set_bit(idx, sched->state.used)) {
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
goto done;
+ }
}
return false;
@@ -745,7 +796,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
if (sched->state.weight > sched->max_weight)
return false;
}
- c = sched->events[sched->state.event]->hw.constraint;
+ c = sched->constraints[sched->state.event];
} while (c->weight != sched->state.weight);
sched->state.counter = 0; /* start with first counter */
@@ -756,12 +807,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
/*
* Assign a counter for each event.
*/
-int perf_assign_events(struct perf_event **events, int n,
- int wmin, int wmax, int *assign)
+int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign)
{
struct perf_sched sched;
- perf_sched_init(&sched, events, n, wmin, wmax);
+ perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
do {
if (!perf_sched_find_counter(&sched))
@@ -788,9 +839,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
x86_pmu.start_scheduling(cpuc);
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
- hwc = &cpuc->event_list[i]->hw;
+ cpuc->event_constraint[i] = NULL;
c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
- hwc->constraint = c;
+ cpuc->event_constraint[i] = c;
wmin = min(wmin, c->weight);
wmax = max(wmax, c->weight);
@@ -801,7 +852,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
*/
for (i = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
- c = hwc->constraint;
+ c = cpuc->event_constraint[i];
/* never assigned */
if (hwc->idx == -1)
@@ -821,9 +872,26 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
}
/* slow path */
- if (i != n)
- unsched = perf_assign_events(cpuc->event_list, n, wmin,
- wmax, assign);
+ if (i != n) {
+ int gpmax = x86_pmu.num_counters;
+
+ /*
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+ * mode. There is no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
+ */
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;
+
+ unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+ wmax, gpmax, assign);
+ }
/*
* In case of success (unsched = 0), mark events as committed,
@@ -840,12 +908,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
if (x86_pmu.commit_scheduling)
- x86_pmu.commit_scheduling(cpuc, e, assign[i]);
+ x86_pmu.commit_scheduling(cpuc, i, assign[i]);
}
- }
-
- if (!assign || unsched) {
-
+ } else {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
@@ -1058,13 +1123,16 @@ int x86_perf_event_set_period(struct perf_event *event)
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
- /*
- * The hw event starts counting from this event offset,
- * mark it to be able to extra future deltas:
- */
- local64_set(&hwc->prev_count, (u64)-left);
+ if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+ local64_read(&hwc->prev_count) != (u64)-left) {
+ /*
+ * The hw event starts counting from this event offset,
+ * mark it to be able to extra future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);
- wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ }
/*
* Due to erratum on certan cpu we need
@@ -1292,8 +1360,10 @@ static void x86_pmu_del(struct perf_event *event, int flags)
x86_pmu.put_event_constraints(cpuc, event);
/* Delete the array entry. */
- while (++i < cpuc->n_events)
+ while (++i < cpuc->n_events) {
cpuc->event_list[i-1] = cpuc->event_list[i];
+ cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+ }
--cpuc->n_events;
perf_event_update_userpage(event);
@@ -1374,6 +1444,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
u64 finish_clock;
int ret;
+ /*
+ * All PMUs/events that share this PMI handler should make sure to
+ * increment active_events for their events.
+ */
if (!atomic_read(&active_events))
return NMI_DONE;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6ac5cb7a9e14..3e7fd27dfe20 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -74,6 +74,9 @@ struct event_constraint {
#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */
struct amd_nb {
@@ -87,6 +90,18 @@ struct amd_nb {
#define MAX_PEBS_EVENTS 8
/*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+ PERF_SAMPLE_TRANSACTION)
+
+/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
@@ -132,10 +147,7 @@ enum intel_excl_state_type {
};
struct intel_excl_states {
- enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
enum intel_excl_state_type state[X86_PMC_IDX_MAX];
- int num_alloc_cntrs;/* #counters allocated */
- int max_alloc_cntrs;/* max #counters allowed */
bool sched_started; /* true if scheduling has started */
};
@@ -144,6 +156,11 @@ struct intel_excl_cntrs {
struct intel_excl_states states[2];
+ union {
+ u16 has_exclusive[2];
+ u32 exclusive_present;
+ };
+
int refcnt; /* per-core: #HT threads */
unsigned core_id; /* per-core: core id */
};
@@ -172,7 +189,11 @@ struct cpu_hw_events {
added in the current transaction */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
+
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+ struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+ int n_excl; /* the number of exclusive events */
unsigned int group_flag;
int is_fake;
@@ -519,12 +540,10 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
- void (*commit_scheduling)(struct cpu_hw_events *cpuc,
- struct perf_event *event,
- int cntr);
-
void (*start_scheduling)(struct cpu_hw_events *cpuc);
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
void (*stop_scheduling)(struct cpu_hw_events *cpuc);
struct event_constraint *event_constraints;
@@ -697,6 +716,10 @@ int x86_add_exclusive(unsigned int what);
void x86_del_exclusive(unsigned int what);
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
void hw_perf_lbr_event_destroy(struct perf_event *event);
int x86_setup_perfctr(struct perf_event *event);
@@ -717,8 +740,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
void x86_pmu_enable_all(int added);
-int perf_assign_events(struct perf_event **events, int n,
- int wmin, int wmax, int *assign);
+int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
void x86_pmu_stop(struct perf_event *event, int flags);
@@ -860,6 +883,8 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_ds_init(void);
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
@@ -929,4 +954,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
return NULL;
}
+static inline int is_ht_workaround_enabled(void)
+{
+ return 0;
+}
#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 324817735771..b9826a981fb2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1903,9 +1903,8 @@ static void
intel_start_scheduling(struct cpu_hw_events *cpuc)
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* sibling thread */
/*
* nothing needed if in group validation mode
@@ -1916,35 +1915,52 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
/*
* no exclusion needed
*/
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return;
- xlo = &excl_cntrs->states[o_tid];
xl = &excl_cntrs->states[tid];
xl->sched_started = true;
- xl->num_alloc_cntrs = 0;
/*
* lock shared state until we are done scheduling
* in stop_event_scheduling()
* makes scheduling appear as a transaction
*/
- WARN_ON_ONCE(!irqs_disabled());
raw_spin_lock(&excl_cntrs->lock);
+}
- /*
- * save initial state of sibling thread
- */
- memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = cpuc->event_constraint[idx];
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ lockdep_assert_held(&excl_cntrs->lock);
+
+ if (c->flags & PERF_X86_EVENT_EXCL)
+ xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xl->state[cntr] = INTEL_EXCL_SHARED;
}
static void
intel_stop_scheduling(struct cpu_hw_events *cpuc)
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* sibling thread */
/*
* nothing needed if in group validation mode
@@ -1954,17 +1970,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
/*
* no exclusion needed
*/
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return;
- xlo = &excl_cntrs->states[o_tid];
xl = &excl_cntrs->states[tid];
- /*
- * make new sibling thread state visible
- */
- memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
-
xl->sched_started = false;
/*
* release shared state lock (acquired in intel_start_scheduling())
@@ -1976,12 +1986,10 @@ static struct event_constraint *
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
int idx, struct event_constraint *c)
{
- struct event_constraint *cx;
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
- int is_excl, i;
+ struct intel_excl_states *xlo;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* alternate */
+ int is_excl, i;
/*
* validating a group does not require
@@ -1993,34 +2001,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
/*
* no exclusion needed
*/
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return c;
- /*
- * event requires exclusive counter access
- * across HT threads
- */
- is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
- /*
- * xl = state of current HT
- * xlo = state of sibling HT
- */
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];
-
- /*
- * do not allow scheduling of more than max_alloc_cntrs
- * which is set to half the available generic counters.
- * this helps avoid counter starvation of sibling thread
- * by ensuring at most half the counters cannot be in
- * exclusive mode. There is not designated counters for the
- * limits. Any N/2 counters can be used. This helps with
- * events with specifix counter constraints
- */
- if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
- return &emptyconstraint;
-
- cx = c;
/*
* because we modify the constraint, we need
@@ -2031,10 +2013,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* been cloned (marked dynamic)
*/
if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-
- /* sanity check */
- if (idx < 0)
- return &emptyconstraint;
+ struct event_constraint *cx;
/*
* grab pre-allocated constraint entry
@@ -2045,13 +2024,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* initialize dynamic constraint
* with static constraint
*/
- memcpy(cx, c, sizeof(*cx));
+ *cx = *c;
/*
* mark constraint as dynamic, so we
* can free it later on
*/
cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ c = cx;
}
/*
@@ -2062,6 +2042,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
*/
/*
+ * state of sibling HT
+ */
+ xlo = &excl_cntrs->states[tid ^ 1];
+
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+ if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+ event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+ if (!cpuc->n_excl++)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+ }
+
+ /*
* Modify static constraint with current dynamic
* state of thread
*
@@ -2069,44 +2065,44 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* SHARED : sibling counter measuring non-exclusive event
* UNUSED : sibling counter unused
*/
- for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+ for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
/*
* exclusive event in sibling counter
* our corresponding counter cannot be used
* regardless of our event
*/
- if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
- __clear_bit(i, cx->idxmsk);
+ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, c->idxmsk);
/*
* if measuring an exclusive event, sibling
* measuring non-exclusive, then counter cannot
* be used
*/
- if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
- __clear_bit(i, cx->idxmsk);
+ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, c->idxmsk);
}
/*
* recompute actual bit weight for scheduling algorithm
*/
- cx->weight = hweight64(cx->idxmsk64);
+ c->weight = hweight64(c->idxmsk64);
/*
* if we return an empty mask, then switch
* back to static empty constraint to avoid
* the cost of freeing later on
*/
- if (cx->weight == 0)
- cx = &emptyconstraint;
+ if (c->weight == 0)
+ c = &emptyconstraint;
- return cx;
+ return c;
}
static struct event_constraint *
intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
- struct event_constraint *c1 = event->hw.constraint;
+ struct event_constraint *c1 = cpuc->event_constraint[idx];
struct event_constraint *c2;
/*
@@ -2132,10 +2128,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
{
struct hw_perf_event *hwc = &event->hw;
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xlo, *xl;
- unsigned long flags = 0; /* keep compiler happy */
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid;
+ struct intel_excl_states *xl;
/*
* nothing needed if in group validation mode
@@ -2143,31 +2137,35 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
if (cpuc->is_fake)
return;
- WARN_ON_ONCE(!excl_cntrs);
-
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return;
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];
+ if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+ hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+ if (!--cpuc->n_excl)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+ }
/*
- * put_constraint may be called from x86_schedule_events()
- * which already has the lock held so here make locking
- * conditional
+ * If event was actually assigned, then mark the counter state as
+ * unused now.
*/
- if (!xl->sched_started)
- raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+ if (hwc->idx >= 0) {
+ xl = &excl_cntrs->states[tid];
- /*
- * if event was actually assigned, then mark the
- * counter state as unused now
- */
- if (hwc->idx >= 0)
- xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional.
+ */
+ if (!xl->sched_started)
+ raw_spin_lock(&excl_cntrs->lock);
- if (!xl->sched_started)
- raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+ xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock(&excl_cntrs->lock);
+ }
}
static void
@@ -2188,8 +2186,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
- struct event_constraint *c = event->hw.constraint;
-
intel_put_shared_regs_event_constraints(cpuc, event);
/*
@@ -2197,48 +2193,8 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
* all events are subject to and must call the
* put_excl_constraints() routine
*/
- if (c && cpuc->excl_cntrs)
+ if (cpuc->excl_cntrs)
intel_put_excl_constraints(cpuc, event);
-
- /* cleanup dynamic constraint */
- if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
- event->hw.constraint = NULL;
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
- struct perf_event *event, int cntr)
-{
- struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct event_constraint *c = event->hw.constraint;
- struct intel_excl_states *xlo, *xl;
- int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid;
- int is_excl;
-
- if (cpuc->is_fake || !c)
- return;
-
- is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
- if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
- return;
-
- WARN_ON_ONCE(!excl_cntrs);
-
- if (!excl_cntrs)
- return;
-
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];
-
- WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
-
- if (cntr >= 0) {
- if (is_excl)
- xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
- else
- xlo->init_state[cntr] = INTEL_EXCL_SHARED;
- }
}
static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2304,8 +2260,15 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (ret)
return ret;
- if (event->attr.precise_ip && x86_pmu.pebs_aliases)
- x86_pmu.pebs_aliases(event);
+ if (event->attr.precise_ip) {
+ if (!event->attr.freq) {
+ event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+ if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+ event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+ }
+ if (x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
+ }
if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
@@ -2554,19 +2517,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
{
struct intel_excl_cntrs *c;
- int i;
c = kzalloc_node(sizeof(struct intel_excl_cntrs),
GFP_KERNEL, cpu_to_node(cpu));
if (c) {
raw_spin_lock_init(&c->lock);
- for (i = 0; i < X86_PMC_IDX_MAX; i++) {
- c->states[0].state[i] = INTEL_EXCL_UNUSED;
- c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
-
- c->states[1].state[i] = INTEL_EXCL_UNUSED;
- c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
- }
c->core_id = -1;
}
return c;
@@ -2639,8 +2594,6 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
- int h = x86_pmu.num_counters >> 1;
-
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
struct intel_excl_cntrs *c;
@@ -2654,11 +2607,6 @@ static void intel_pmu_cpu_starting(int cpu)
}
cpuc->excl_cntrs->core_id = core_id;
cpuc->excl_cntrs->refcnt++;
- /*
- * set hard limit to half the number of generic counters
- */
- cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
- cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
}
}
@@ -2694,6 +2642,15 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (x86_pmu.pebs_active)
+ intel_pmu_pebs_sched_task(ctx, sched_in);
+ if (x86_pmu.lbr_nr)
+ intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2783,7 +2740,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .sched_task = intel_pmu_lbr_sched_task,
+ .sched_task = intel_pmu_sched_task,
};
static __init void intel_clovertown_quirk(void)
@@ -2956,8 +2913,8 @@ static __init void intel_ht_bug(void)
{
x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
- x86_pmu.commit_scheduling = intel_commit_scheduling;
x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
x86_pmu.stop_scheduling = intel_stop_scheduling;
}
@@ -3270,6 +3227,8 @@ __init int intel_pmu_init(void)
case 61: /* 14nm Broadwell Core-M */
case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -3339,13 +3298,13 @@ __init int intel_pmu_init(void)
* counter, so do not extend mask to generic counters
*/
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != FIXED_EVENT_FLAGS
- || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- continue;
+ if (c->cmask == FIXED_EVENT_FLAGS
+ && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
-
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- c->weight += x86_pmu.num_counters;
+ c->idxmsk64 &=
+ ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+ c->weight = hweight64(c->idxmsk64);
}
}
@@ -3413,8 +3372,8 @@ static __init int fixup_ht_bug(void)
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
- x86_pmu.commit_scheduling = NULL;
x86_pmu.start_scheduling = NULL;
+ x86_pmu.commit_scheduling = NULL;
x86_pmu.stop_scheduling = NULL;
watchdog_nmi_enable_all();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index ac1f0c55f379..7795f3f8b1d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -483,17 +483,26 @@ static int bts_event_add(struct perf_event *event, int mode)
static void bts_event_destroy(struct perf_event *event)
{
+ x86_release_hardware();
x86_del_exclusive(x86_lbr_exclusive_bts);
}
static int bts_event_init(struct perf_event *event)
{
+ int ret;
+
if (event->attr.type != bts_pmu.type)
return -ENOENT;
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
+ ret = x86_reserve_hardware();
+ if (ret) {
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+ return ret;
+ }
+
event->destroy = bts_event_destroy;
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index e4d1b8b738fa..188076161c1b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,16 +13,35 @@
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
-static unsigned int cqm_max_rmid = -1;
+static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-struct intel_cqm_state {
- raw_spinlock_t lock;
- int rmid;
- int cnt;
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid: The cached Resource Monitoring ID
+ * @closid: The cached Class Of Service ID
+ * @rmid_usecnt: The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 rmid;
+ u32 closid;
+ int rmid_usecnt;
};
-static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask;
* near-zero occupancy value, i.e. no cachelines are tagged with this
* RMID, once __intel_cqm_rmid_rotate() returns.
*/
-static unsigned int intel_cqm_rotation_rmid;
+static u32 intel_cqm_rotation_rmid;
#define INVALID_RMID (-1)
@@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid;
* Likewise, an rmid value of -1 is used to indicate "no rmid currently
* assigned" and is used as part of the rotation code.
*/
-static inline bool __rmid_valid(unsigned int rmid)
+static inline bool __rmid_valid(u32 rmid)
{
if (!rmid || rmid == INVALID_RMID)
return false;
@@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid)
return true;
}
-static u64 __rmid_read(unsigned int rmid)
+static u64 __rmid_read(u32 rmid)
{
u64 val;
@@ -102,7 +121,7 @@ enum rmid_recycle_state {
};
struct cqm_rmid_entry {
- unsigned int rmid;
+ u32 rmid;
enum rmid_recycle_state state;
struct list_head list;
unsigned long queue_time;
@@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru);
*/
static struct cqm_rmid_entry **cqm_rmid_ptrs;
-static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
{
struct cqm_rmid_entry *entry;
@@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
*
* We expect to be called with cache_mutex held.
*/
-static int __get_rmid(void)
+static u32 __get_rmid(void)
{
struct cqm_rmid_entry *entry;
@@ -177,7 +196,7 @@ static int __get_rmid(void)
return entry->rmid;
}
-static void __put_rmid(unsigned int rmid)
+static void __put_rmid(u32 rmid)
{
struct cqm_rmid_entry *entry;
@@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
}
struct rmid_read {
- unsigned int rmid;
+ u32 rmid;
atomic64_t value;
};
@@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info);
/*
* Exchange the RMID of a group of events.
*/
-static unsigned int
-intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
struct perf_event *event;
- unsigned int old_rmid = group->hw.cqm_rmid;
struct list_head *head = &group->hw.cqm_group_entry;
+ u32 old_rmid = group->hw.cqm_rmid;
lockdep_assert_held(&cache_mutex);
@@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg)
* If we have group events waiting for an RMID that don't conflict with
* events already running, assign @rmid.
*/
-static bool intel_cqm_sched_in_event(unsigned int rmid)
+static bool intel_cqm_sched_in_event(u32 rmid)
{
struct perf_event *leader, *event;
@@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
struct perf_event *rotor;
- unsigned int rmid;
+ u32 rmid;
lockdep_assert_held(&cache_mutex);
@@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next)
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
struct perf_event *group, *g;
- unsigned int rmid;
+ u32 rmid;
lockdep_assert_held(&cache_mutex);
@@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
struct perf_event **group)
{
struct perf_event *iter;
- unsigned int rmid;
bool conflict = false;
+ u32 rmid;
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
static void intel_cqm_event_read(struct perf_event *event)
{
unsigned long flags;
- unsigned int rmid;
+ u32 rmid;
u64 val;
/*
@@ -961,55 +979,48 @@ out:
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
- struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
- unsigned int rmid = event->hw.cqm_rmid;
- unsigned long flags;
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u32 rmid = event->hw.cqm_rmid;
if (!(event->hw.cqm_state & PERF_HES_STOPPED))
return;
event->hw.cqm_state &= ~PERF_HES_STOPPED;
- raw_spin_lock_irqsave(&state->lock, flags);
-
- if (state->cnt++)
- WARN_ON_ONCE(state->rmid != rmid);
- else
+ if (state->rmid_usecnt++) {
+ if (!WARN_ON_ONCE(state->rmid != rmid))
+ return;
+ } else {
WARN_ON_ONCE(state->rmid);
+ }
state->rmid = rmid;
- wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
-
- raw_spin_unlock_irqrestore(&state->lock, flags);
+ wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
}
static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
- struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
- unsigned long flags;
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
if (event->hw.cqm_state & PERF_HES_STOPPED)
return;
event->hw.cqm_state |= PERF_HES_STOPPED;
- raw_spin_lock_irqsave(&state->lock, flags);
intel_cqm_event_read(event);
- if (!--state->cnt) {
+ if (!--state->rmid_usecnt) {
state->rmid = 0;
- wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
} else {
WARN_ON_ONCE(!state->rmid);
}
-
- raw_spin_unlock_irqrestore(&state->lock, flags);
}
static int intel_cqm_event_add(struct perf_event *event, int mode)
{
unsigned long flags;
- unsigned int rmid;
+ u32 rmid;
raw_spin_lock_irqsave(&cache_lock, flags);
@@ -1024,11 +1035,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
return 0;
}
-static void intel_cqm_event_del(struct perf_event *event, int mode)
-{
- intel_cqm_event_stop(event, mode);
-}
-
static void intel_cqm_event_destroy(struct perf_event *event)
{
struct perf_event *group_other = NULL;
@@ -1057,7 +1063,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
list_replace(&event->hw.cqm_groups_entry,
&group_other->hw.cqm_groups_entry);
} else {
- unsigned int rmid = event->hw.cqm_rmid;
+ u32 rmid = event->hw.cqm_rmid;
if (__rmid_valid(rmid))
__put_rmid(rmid);
@@ -1221,7 +1227,7 @@ static struct pmu intel_cqm_pmu = {
.task_ctx_nr = perf_sw_context,
.event_init = intel_cqm_event_init,
.add = intel_cqm_event_add,
- .del = intel_cqm_event_del,
+ .del = intel_cqm_event_stop,
.start = intel_cqm_event_start,
.stop = intel_cqm_event_stop,
.read = intel_cqm_event_read,
@@ -1243,12 +1249,12 @@ static inline void cqm_pick_event_reader(int cpu)
static void intel_cqm_cpu_prepare(unsigned int cpu)
{
- struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
- raw_spin_lock_init(&state->lock);
state->rmid = 0;
- state->cnt = 0;
+ state->closid = 0;
+ state->rmid_usecnt = 0;
WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 813f75d71175..71fc40238843 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -11,7 +11,7 @@
#define BTS_RECORD_SIZE 24
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE PAGE_SIZE
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
@@ -250,7 +250,7 @@ static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
- int max, thresh = 1; /* always use a single PEBS record */
+ int max;
void *buffer, *ibuffer;
if (!x86_pmu.pebs)
@@ -280,9 +280,6 @@ static int alloc_pebs_buffer(int cpu)
ds->pebs_absolute_maximum = ds->pebs_buffer_base +
max * x86_pmu.pebs_record_size;
- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- thresh * x86_pmu.pebs_record_size;
-
return 0;
}
@@ -549,6 +546,19 @@ int intel_pmu_drain_bts_buffer(void)
return 1;
}
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+ struct pt_regs regs;
+
+ x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ if (!sched_in)
+ intel_pmu_drain_pebs_buffer();
+}
+
/*
* PEBS
*/
@@ -684,33 +694,81 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+ return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ bool first_pebs;
+ u64 threshold;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+ first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;
if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled |= 1ULL << 63;
+
+ /*
+ * When the event is constrained enough we can use a larger
+ * threshold and run the event with less frequent PMI.
+ */
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+ if (first_pebs)
+ perf_sched_cb_inc(event->ctx->pmu);
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+ /*
+ * If not all events can use larger buffer,
+ * roll back to threshold = 1
+ */
+ if (!first_pebs &&
+ (ds->pebs_interrupt_threshold > threshold))
+ perf_sched_cb_dec(event->ctx->pmu);
+ }
+
+ /* Use auto-reload if possible to save a MSR write in the PMI */
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+ ds->pebs_event_reset[hwc->idx] =
+ (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+ }
+
+ if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+ ds->pebs_interrupt_threshold = threshold;
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
- if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
- else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST)
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
+ if (ds->pebs_interrupt_threshold >
+ ds->pebs_buffer_base + x86_pmu.pebs_record_size) {
+ intel_pmu_drain_pebs_buffer();
+ if (!pebs_is_enabled(cpuc))
+ perf_sched_cb_dec(event->ctx->pmu);
+ }
+
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
@@ -846,8 +904,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
return txn;
}
-static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs, void *__pebs)
+static void setup_pebs_sample_data(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
#define PERF_X86_EVENT_PEBS_HSW_PREC \
(PERF_X86_EVENT_PEBS_ST_HSW | \
@@ -859,13 +919,11 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
*/
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct pebs_record_hsw *pebs = __pebs;
- struct perf_sample_data data;
- struct pt_regs regs;
u64 sample_type;
int fll, fst, dsrc;
int fl = event->hw.flags;
- if (!intel_pmu_save_and_restart(event))
+ if (pebs == NULL)
return;
sample_type = event->attr.sample_type;
@@ -874,15 +932,15 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
- perf_sample_data_init(&data, 0, event->hw.last_period);
+ perf_sample_data_init(data, 0, event->hw.last_period);
- data.period = event->hw.last_period;
+ data->period = event->hw.last_period;
/*
* Use latency for weight (only avail with PEBS-LL)
*/
if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data.weight = pebs->lat;
+ data->weight = pebs->lat;
/*
* data.data_src encodes the data source
@@ -895,7 +953,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
val = precise_datala_hsw(event, pebs->dse);
else if (fst)
val = precise_store_data(pebs->dse);
- data.data_src.val = val;
+ data->data_src.val = val;
}
/*
@@ -908,61 +966,123 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
* A possible PERF_SAMPLE_REGS will have to transfer all regs.
*/
- regs = *iregs;
- regs.flags = pebs->flags;
- set_linear_ip(&regs, pebs->ip);
- regs.bp = pebs->bp;
- regs.sp = pebs->sp;
+ *regs = *iregs;
+ regs->flags = pebs->flags;
+ set_linear_ip(regs, pebs->ip);
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
if (sample_type & PERF_SAMPLE_REGS_INTR) {
- regs.ax = pebs->ax;
- regs.bx = pebs->bx;
- regs.cx = pebs->cx;
- regs.dx = pebs->dx;
- regs.si = pebs->si;
- regs.di = pebs->di;
- regs.bp = pebs->bp;
- regs.sp = pebs->sp;
-
- regs.flags = pebs->flags;
+ regs->ax = pebs->ax;
+ regs->bx = pebs->bx;
+ regs->cx = pebs->cx;
+ regs->dx = pebs->dx;
+ regs->si = pebs->si;
+ regs->di = pebs->di;
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
+
+ regs->flags = pebs->flags;
#ifndef CONFIG_X86_32
- regs.r8 = pebs->r8;
- regs.r9 = pebs->r9;
- regs.r10 = pebs->r10;
- regs.r11 = pebs->r11;
- regs.r12 = pebs->r12;
- regs.r13 = pebs->r13;
- regs.r14 = pebs->r14;
- regs.r15 = pebs->r15;
+ regs->r8 = pebs->r8;
+ regs->r9 = pebs->r9;
+ regs->r10 = pebs->r10;
+ regs->r11 = pebs->r11;
+ regs->r12 = pebs->r12;
+ regs->r13 = pebs->r13;
+ regs->r14 = pebs->r14;
+ regs->r15 = pebs->r15;
#endif
}
if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
- regs.ip = pebs->real_ip;
- regs.flags |= PERF_EFLAGS_EXACT;
- } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
- regs.flags |= PERF_EFLAGS_EXACT;
+ regs->ip = pebs->real_ip;
+ regs->flags |= PERF_EFLAGS_EXACT;
+ } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+ regs->flags |= PERF_EFLAGS_EXACT;
else
- regs.flags &= ~PERF_EFLAGS_EXACT;
+ regs->flags &= ~PERF_EFLAGS_EXACT;
if ((sample_type & PERF_SAMPLE_ADDR) &&
x86_pmu.intel_cap.pebs_format >= 1)
- data.addr = pebs->dla;
+ data->addr = pebs->dla;
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
- data.weight = intel_hsw_weight(pebs);
+ data->weight = intel_hsw_weight(pebs);
if (sample_type & PERF_SAMPLE_TRANSACTION)
- data.txn = intel_hsw_transaction(pebs);
+ data->txn = intel_hsw_transaction(pebs);
}
if (has_branch_stack(event))
- data.br_stack = &cpuc->lbr_stack;
+ data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ void *at;
+ u64 pebs_status;
+
+ if (base == NULL)
+ return NULL;
+
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
- if (perf_event_overflow(event, &data, &regs))
+ if (test_bit(bit, (unsigned long *)&p->status)) {
+ /* PEBS v3 has accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ return at;
+
+ if (p->status == (1 << bit))
+ return at;
+
+ /* clear non-PEBS bit and re-check */
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+ if (pebs_status == (1 << bit))
+ return at;
+ }
+ }
+ return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ void *base, void *top,
+ int bit, int count)
+{
+ struct perf_sample_data data;
+ struct pt_regs regs;
+ void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+ if (!intel_pmu_save_and_restart(event) &&
+ !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+ return;
+
+ while (count > 1) {
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+ perf_event_output(event, &data, &regs);
+ at += x86_pmu.pebs_record_size;
+ at = get_next_pebs_record_by_bit(at, top, bit);
+ count--;
+ }
+
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+ /*
+ * All but the last records are processed.
+ * The last one is left to be able to call the overflow handler.
+ */
+ if (perf_event_overflow(event, &data, &regs)) {
x86_pmu_stop(event, 0);
+ return;
+ }
+
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -992,72 +1112,99 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
if (!event->attr.precise_ip)
return;
- n = top - at;
+ n = (top - at) / x86_pmu.pebs_record_size;
if (n <= 0)
return;
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
- at += n - 1;
-
- __intel_pmu_pebs_event(event, iregs, at);
+ __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
}
static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
- struct perf_event *event = NULL;
- void *at, *top;
- u64 status = 0;
- int bit;
+ struct perf_event *event;
+ void *base, *at, *top;
+ short counts[MAX_PEBS_EVENTS] = {};
+ short error[MAX_PEBS_EVENTS] = {};
+ int bit, i;
if (!x86_pmu.pebs_active)
return;
- at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
ds->pebs_index = ds->pebs_buffer_base;
- if (unlikely(at > top))
+ if (unlikely(base >= top))
return;
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
- "Unexpected number of pebs records %ld\n",
- (long)(top - at) / x86_pmu.pebs_record_size);
-
- for (; at < top; at += x86_pmu.pebs_record_size) {
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;
- for_each_set_bit(bit, (unsigned long *)&p->status,
- x86_pmu.max_pebs_events) {
- event = cpuc->events[bit];
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- WARN_ON_ONCE(!event);
+ /* PEBS v3 has accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3) {
+ for_each_set_bit(bit, (unsigned long *)&p->status,
+ MAX_PEBS_EVENTS)
+ counts[bit]++;
- if (!event->attr.precise_ip)
- continue;
+ continue;
+ }
- if (__test_and_set_bit(bit, (unsigned long *)&status))
+ bit = find_first_bit((unsigned long *)&p->status,
+ x86_pmu.max_pebs_events);
+ if (bit >= x86_pmu.max_pebs_events)
+ continue;
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+ /*
+ * The PEBS hardware does not deal well with the situation
+ * when events happen near to each other and multiple bits
+ * are set. But it should happen rarely.
+ *
+ * If these events include one PEBS and multiple non-PEBS
+ * events, it doesn't impact PEBS record. The record will
+ * be handled normally. (slow path)
+ *
+ * If these events include two or more PEBS events, the
+ * records for the events can be collapsed into a single
+ * one, and it's not possible to reconstruct all events
+ * that caused the PEBS record. It's called collision.
+ * If collision happened, the record will be dropped.
+ *
+ */
+ if (p->status != (1 << bit)) {
+ u64 pebs_status;
+
+ /* slow path */
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+ if (pebs_status != (1 << bit)) {
+ for_each_set_bit(i, (unsigned long *)&pebs_status,
+ MAX_PEBS_EVENTS)
+ error[i]++;
continue;
-
- break;
+ }
}
+ counts[bit]++;
+ }
- if (!event || bit >= x86_pmu.max_pebs_events)
+ for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+ if ((counts[bit] == 0) && (error[bit] == 0))
continue;
+ event = cpuc->events[bit];
+ WARN_ON_ONCE(!event);
+ WARN_ON_ONCE(!event->attr.precise_ip);
- __intel_pmu_pebs_event(event, iregs, at);
+ /* log dropped samples number */
+ if (error[bit])
+ perf_log_lost_samples(event, error[bit]);
+
+ if (counts[bit]) {
+ __intel_pmu_pebs_event(event, iregs, base,
+ top, bit, counts[bit]);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 94e5b506caa6..452a7bd2dedb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -96,6 +96,7 @@ enum {
X86_BR_NO_TX = 1 << 14,/* not in transaction */
X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
X86_BR_CALL_STACK = 1 << 16,/* call stack */
+ X86_BR_IND_JMP = 1 << 17,/* indirect jump */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -113,6 +114,7 @@ enum {
X86_BR_IRQ |\
X86_BR_ABORT |\
X86_BR_IND_CALL |\
+ X86_BR_IND_JMP |\
X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
@@ -262,9 +264,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
- if (!x86_pmu.lbr_nr)
- return;
-
/*
* If LBR callstack feature is enabled and the stack was saved when
* the task was scheduled out, restore the stack. Otherwise flush
@@ -523,6 +522,9 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
X86_BR_CALL_STACK;
}
+ if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ mask |= X86_BR_IND_JMP;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -736,7 +738,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
break;
case 4:
case 5:
- ret = X86_BR_JMP;
+ ret = X86_BR_IND_JMP;
break;
}
break;
@@ -844,6 +846,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
*/
[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -856,6 +859,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
};
static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -870,6 +874,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
| LBR_RETURN | LBR_CALL_STACK,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
};
/* core */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index ffe666c2c6b5..159887c3a89d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -151,7 +151,7 @@ static int __init pt_pmu_hw_init(void)
de_attr->attr.attr.name = pt_caps[i].name;
- sysfs_attr_init(&de_attrs->attr.attr);
+ sysfs_attr_init(&de_attr->attr.attr);
de_attr->attr.attr.mode = S_IRUGO;
de_attr->attr.show = pt_cap_show;
@@ -187,15 +187,6 @@ static bool pt_event_valid(struct perf_event *event)
* These all are cpu affine and operate on a local PT
*/
-static bool pt_is_running(void)
-{
- u64 ctl;
-
- rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-
- return !!(ctl & RTIT_CTL_TRACEEN);
-}
-
static void pt_config(struct perf_event *event)
{
u64 reg;
@@ -609,16 +600,19 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
* @handle: Current output handle.
*
* Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected.
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
*/
static int pt_buffer_reset_markers(struct pt_buffer *buf,
struct perf_output_handle *handle)
{
- unsigned long idx, npages, end;
-
- if (buf->snapshot)
- return 0;
+ unsigned long head = local64_read(&buf->head);
+ unsigned long idx, npages, wakeup;
/* can't stop in the middle of an output region */
if (buf->output_off + handle->size + 1 <
@@ -634,17 +628,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
buf->topa_index[buf->stop_pos]->stop = 0;
buf->topa_index[buf->intr_pos]->intr = 0;
- if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
- npages = (handle->size + 1) >> PAGE_SHIFT;
- end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
- /*if (end > handle->wakeup >> PAGE_SHIFT)
- end = handle->wakeup >> PAGE_SHIFT;*/
- idx = end & (buf->nr_pages - 1);
- buf->stop_pos = idx;
- idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
- idx &= buf->nr_pages - 1;
- buf->intr_pos = idx;
- }
+ /* how many pages till the STOP marker */
+ npages = handle->size >> PAGE_SHIFT;
+
+ /* if it's on a page boundary, fill up one more page */
+ if (!offset_in_page(head + handle->size + 1))
+ npages++;
+
+ idx = (head >> PAGE_SHIFT) + npages;
+ idx &= buf->nr_pages - 1;
+ buf->stop_pos = idx;
+
+ wakeup = handle->wakeup >> PAGE_SHIFT;
+
+ /* in the worst case, wake up the consumer one page before hard stop */
+ idx = (head >> PAGE_SHIFT) + npages - 1;
+ if (idx > wakeup)
+ idx = wakeup;
+
+ idx &= buf->nr_pages - 1;
+ buf->intr_pos = idx;
buf->topa_index[buf->stop_pos]->stop = 1;
buf->topa_index[buf->intr_pos]->intr = 1;
@@ -664,7 +667,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
struct topa *cur = buf->first, *prev = buf->last;
struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
*te_prev = TOPA_ENTRY(prev, prev->last - 1);
- int pg = 0, idx = 0, ntopa = 0;
+ int pg = 0, idx = 0;
while (pg < buf->nr_pages) {
int tidx;
@@ -679,9 +682,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
/* advance to next topa table */
idx = 0;
cur = list_entry(cur->list.next, struct topa, list);
- ntopa++;
- } else
+ } else {
idx++;
+ }
te_cur = TOPA_ENTRY(cur, idx);
}
@@ -693,7 +696,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
* @head: Write pointer (aux_head) from AUX buffer.
*
* Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly.
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
*/
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
@@ -891,6 +901,7 @@ void intel_pt_interrupt(void)
}
pt_buffer_reset_offsets(buf, pt->handle.head);
+ /* snapshot counters don't use PMI, so it's safe */
ret = pt_buffer_reset_markers(buf, &pt->handle);
if (ret) {
perf_aux_output_end(&pt->handle, 0, true);
@@ -913,7 +924,7 @@ static void pt_event_start(struct perf_event *event, int mode)
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf = perf_get_aux(&pt->handle);
- if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+ if (!buf || pt_buffer_is_full(buf, pt)) {
event->hw.state = PERF_HES_STOPPED;
return;
}
@@ -944,7 +955,6 @@ static void pt_event_stop(struct perf_event *event, int mode)
event->hw.state = PERF_HES_STOPPED;
if (mode & PERF_EF_UPDATE) {
- struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf = perf_get_aux(&pt->handle);
if (!buf)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index c635b8b49e93..7c1de1610178 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -365,9 +365,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
- hwc = &box->event_list[i]->hw;
c = uncore_get_event_constraint(box, box->event_list[i]);
- hwc->constraint = c;
+ box->event_constraint[i] = c;
wmin = min(wmin, c->weight);
wmax = max(wmax, c->weight);
}
@@ -375,7 +374,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
/* fastpath, try to reuse previous register */
for (i = 0; i < n; i++) {
hwc = &box->event_list[i]->hw;
- c = hwc->constraint;
+ c = box->event_constraint[i];
/* never assigned */
if (hwc->idx == -1)
@@ -395,8 +394,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
}
/* slow path */
if (i != n)
- ret = perf_assign_events(box->event_list, n,
- wmin, wmax, assign);
+ ret = perf_assign_events(box->event_constraint, n,
+ wmin, wmax, n, assign);
if (!assign || ret) {
for (i = 0; i < n; i++)
@@ -840,6 +839,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
box->phys_id = phys_id;
box->pci_dev = pdev;
box->pmu = pmu;
+ uncore_box_init(box);
pci_set_drvdata(pdev, box);
raw_spin_lock(&uncore_box_lock);
@@ -922,6 +922,9 @@ static int __init uncore_pci_init(void)
case 69: /* Haswell Celeron */
ret = hsw_uncore_pci_init();
break;
+ case 61: /* Broadwell */
+ ret = bdw_uncore_pci_init();
+ break;
default:
return 0;
}
@@ -1003,8 +1006,10 @@ static int uncore_cpu_starting(int cpu)
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
/* called by uncore_cpu_init? */
- if (box && box->phys_id >= 0)
+ if (box && box->phys_id >= 0) {
+ uncore_box_init(box);
continue;
+ }
for_each_online_cpu(k) {
exist = *per_cpu_ptr(pmu->box, k);
@@ -1020,8 +1025,10 @@ static int uncore_cpu_starting(int cpu)
}
}
- if (box)
+ if (box) {
box->phys_id = phys_id;
+ uncore_box_init(box);
+ }
}
}
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 6c8c1e7e69d8..0f77f0a196e4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -97,6 +97,7 @@ struct intel_uncore_box {
atomic_t refcnt;
struct perf_event *events[UNCORE_PMC_IDX_MAX];
struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+ struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
u64 tags[UNCORE_PMC_IDX_MAX];
struct pci_dev *pci_dev;
@@ -257,14 +258,6 @@ static inline int uncore_num_counters(struct intel_uncore_box *box)
return box->pmu->type->num_counters;
}
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
- if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
- if (box->pmu->type->ops->init_box)
- box->pmu->type->ops->init_box(box);
- }
-}
-
static inline void uncore_disable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->disable_box)
@@ -273,8 +266,6 @@ static inline void uncore_disable_box(struct intel_uncore_box *box)
static inline void uncore_enable_box(struct intel_uncore_box *box)
{
- uncore_box_init(box);
-
if (box->pmu->type->ops->enable_box)
box->pmu->type->ops->enable_box(box);
}
@@ -297,6 +288,14 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box,
return box->pmu->type->ops->read_counter(box, event);
}
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+ if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->init_box)
+ box->pmu->type->ops->init_box(box);
+ }
+}
+
static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
return (box->phys_id < 0);
@@ -326,6 +325,7 @@ extern struct event_constraint uncore_constraint_empty;
int snb_uncore_pci_init(void);
int ivb_uncore_pci_init(void);
int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 4562e9e22c60..b005a78c7012 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -7,6 +7,7 @@
#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00
#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
{ /* end: all zeroes */ },
};
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
static struct pci_driver snb_uncore_pci_driver = {
.name = "snb_uncore",
.id_table = snb_uncore_pci_ids,
@@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
.id_table = hsw_uncore_pci_ids,
};
+static struct pci_driver bdw_uncore_pci_driver = {
+ .name = "bdw_uncore",
+ .id_table = bdw_uncore_pci_ids,
+};
+
struct imc_uncore_pci_dev {
__u32 pci_id;
struct pci_driver *driver;
@@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
+ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */
{ /* end marker */ }
};
@@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void)
return imc_uncore_pci_init();
}
+int bdw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 12d9548457e7..6d6e85dd5849 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -164,8 +164,8 @@
((1ULL << (n)) - 1)))
/* Haswell-EP Ubox */
-#define HSWEP_U_MSR_PMON_CTR0 0x705
-#define HSWEP_U_MSR_PMON_CTL0 0x709
+#define HSWEP_U_MSR_PMON_CTR0 0x709
+#define HSWEP_U_MSR_PMON_CTL0 0x705
#define HSWEP_U_MSR_PMON_FILTER 0x707
#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703
@@ -1914,7 +1914,7 @@ static struct intel_uncore_type hswep_uncore_cbox = {
.name = "cbox",
.num_counters = 4,
.num_boxes = 18,
- .perf_ctr_bits = 44,
+ .perf_ctr_bits = 48,
.event_ctl = HSWEP_C0_MSR_PMON_CTL0,
.perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
.event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2b55ee6db053..5a4668136e98 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handlers[i]);
+ set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d031bad9e07e..53eeb226657c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -478,21 +478,22 @@ is486:
__INIT
setup_once:
/*
- * Set up a idt with 256 entries pointing to ignore_int,
- * interrupt gates. It doesn't actually load idt - that needs
- * to be done on each CPU. Interrupts are enabled elsewhere,
- * when we can be relatively sure everything is ok.
+ * Set up a idt with 256 interrupt gates that push zero if there
+ * is no error code and then jump to early_idt_handler_common.
+ * It doesn't actually load the idt - that needs to be done on
+ * each CPU. Interrupts are enabled elsewhere, when we can be
+ * relatively sure everything is ok.
*/
movl $idt_table,%edi
- movl $early_idt_handlers,%eax
+ movl $early_idt_handler_array,%eax
movl $NUM_EXCEPTION_VECTORS,%ecx
1:
movl %eax,(%edi)
movl %eax,4(%edi)
/* interrupt gate, dpl=0, present */
movl $(0x8E000000 + __KERNEL_CS),2(%edi)
- addl $9,%eax
+ addl $EARLY_IDT_HANDLER_SIZE,%eax
addl $8,%edi
loop 1b
@@ -524,26 +525,28 @@ setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
ret
-ENTRY(early_idt_handlers)
+ENTRY(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
# 28(%esp) %eip
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .if (EXCEPTION_ERRCODE_MASK >> i) & 1
- ASM_NOP2
- .else
+ .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
- jmp early_idt_handler
+ jmp early_idt_handler_common
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handlers)
+ENDPROC(early_idt_handler_array)
- /* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
cld
cmpl $2,(%esp) # X86_TRAP_NMI
@@ -603,7 +606,7 @@ ex_entry:
is_nmi:
addl $8,%esp /* drop vector number and error code */
iret
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ae6588b301c2..df7e78057ae0 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -321,26 +321,28 @@ bad_address:
jmp bad_address
__INIT
- .globl early_idt_handlers
-early_idt_handlers:
+ENTRY(early_idt_handler_array)
# 104(%rsp) %rflags
# 96(%rsp) %cs
# 88(%rsp) %rip
# 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .if (EXCEPTION_ERRCODE_MASK >> i) & 1
- ASM_NOP2
- .else
+ .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushq $0 # Dummy error code, to make stack frame uniform
.endif
pushq $i # 72(%rsp) Vector number
- jmp early_idt_handler
+ jmp early_idt_handler_common
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
+ENDPROC(early_idt_handler_array)
-/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
cld
cmpl $2,(%rsp) # X86_TRAP_NMI
@@ -412,7 +414,7 @@ ENTRY(early_idt_handler)
is_nmi:
addq $16,%rsp # drop vector number and error code
INTERRUPT_RETURN
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
__INITDATA
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620062df..1681504e44a4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu)
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ if (READ_ONCE(*ptr) != val)
+ goto out;
+
+ /*
+ * halt until it's our turn and kicked. Note that we do safe halt
+ * for irq enabled case to avoid hang when lock info is overwritten
+ * in irq spinlock slowpath and no spurious interrupt occur to save us.
+ */
+ if (arch_irqs_disabled_flags(flags))
+ halt();
+ else
+ safe_halt();
+
+out:
+ local_irq_restore(flags);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
enum kvm_contention_stat {
TAKEN_SLOW,
TAKEN_SLOW_PICKUP,
@@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
}
}
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
+#ifdef CONFIG_QUEUED_SPINLOCKS
+ __pv_init_lock_hash();
+ pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_lock_ops.wait = kvm_wait;
+ pv_lock_ops.kick = kvm_kick_cpu;
+#else /* !CONFIG_QUEUED_SPINLOCKS */
pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index bbb6c7316341..33ee3e0efd65 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,11 +8,33 @@
#include <asm/paravirt.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+ native_queued_spin_unlock(lock);
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+bool pv_is_native_spin_unlock(void)
+{
+ return pv_lock_ops.queued_spin_unlock.func ==
+ __raw_callee_save___native_queued_spin_unlock;
+}
+#endif
+
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUED_SPINLOCKS
+ .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .wait = paravirt_nop,
+ .kick = paravirt_nop,
+#else /* !CONFIG_QUEUED_SPINLOCKS */
.lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
.unlock_kick = paravirt_nop,
-#endif
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6d6ab6..e1b013696dde 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+#endif
+
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
{
/* arg in %eax, return in %eax */
@@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
return 0;
}
+extern bool pv_is_native_spin_unlock(void);
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
@@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_cpu_ops, read_tsc);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+#endif
default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
+
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
}
#undef PATCH_SITE
return ret;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index a1da6737ba5b..a1fa86782186 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+#endif
+
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
{
return paravirt_patch_insns(insnbuf, len,
@@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
start__mov64, end__mov64);
}
+extern bool pv_is_native_spin_unlock(void);
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
@@ -59,14 +65,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+#endif
default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
+
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
}
#undef PATCH_SITE
return ret;