Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf updates from Ingo Molnar: "The main changes in this cycle were: Kernel side changes: - Kprobes and uprobes changes: - Make their trampolines read-only while they are used - Make UPROBES_EVENTS default-y which is the distro practice - Apply misc fixes and robustization to probe point insertion. - add support for AMD IOMMU events - extend hw events on Intel Goldmont CPUs - ... plus misc fixes and updates. Tooling side changes: - support s390 jump instructions in perf annotate (Christian Borntraeger) - vendor hardware events updates (Andi Kleen) - add argument support for SDT events in powerpc (Ravi Bangoria) - beautify the statx syscall arguments in 'perf trace' (Arnaldo Carvalho de Melo) - handle inline functions in callchains (Jin Yao) - enable sorting by srcline as key (Milian Wolff) - add 'brstackinsn' field in 'perf script' to reuse the x86 instruction decoder used in the Intel PT code to study hot paths to samples (Andi Kleen) - add PERF_RECORD_NAMESPACES so that the kernel can record information required to associate samples to namespaces, helping in container problem characterization. (Hari Bathini) - allow sorting by symbol_size in 'perf report' and 'perf top' (Charles Baylis) - in perf stat, make system wide (-a) the default option if no target was specified and one of following conditions is met: - no workload specified (current behaviour) - a workload is specified but all requested events are system wide ones, like uncore ones. (Jiri Olsa) - ... plus lots of other updates, enhancements, cleanups and fixes" * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (235 commits) perf tools: Fix the code to strip command name tools arch x86: Sync cpufeatures.h tools arch: Sync arch/x86/lib/memcpy_64.S with the kernel tools: Update asm-generic/mman-common.h copy from the kernel perf tools: Use just forward declarations for struct thread where possible perf tools: Add the right header to obtain PERF_ALIGN() perf tools: Remove poll.h and wait.h from util.h perf tools: Remove string.h, unistd.h and sys/stat.h from util.h perf tools: Remove stale prototypes from builtin.h perf tools: Remove string.h from util.h perf tools: Remove sys/ioctl.h from util.h perf tools: Remove a few more needless includes from util.h perf tools: Include sys/param.h where needed perf callchain: Move callchain specific routines from util.[ch] perf tools: Add compress.h for the *_decompress_to_file() headers perf mem: Fix display of data source snoop indication perf debug: Move dump_stack() and sighandler_dump_stack() to debug.h perf kvm: Make function only used by 'perf kvm' static perf tools: Move timestamp routines from util.h to time-utils.h perf tools: Move units conversion/formatting routines to separate object ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-01 20:23:17 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-01 20:23:17 -0700
commit: 7c8c03bfc7b9f5211d8a69eab7fee99c9fb4f449 (patch)
tree: a5cee67325e50e893bf0cc0a0d060983a0df6653 /arch
parent: 6dc2cce9321198172cd96f955a5fc798a4cc35a6 (diff)
parent: fd7647979a3948dae4fc6f25dbbdf9ba269bed78 (diff)
14 files changed, 423 insertions, 346 deletions
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index b28200dea715..3641e24fdac5 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -11,6 +11,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt)	"perf/amd_iommu: " fmt
+
 #include <linux/perf_event.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
@@ -21,44 +23,42 @@
 
 #define COUNTER_SHIFT		16
 
-#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x)     ((x)->conf & 0xFFULL)
+#define GET_DEVID(x)       (((x)->conf >> 8)  & 0xFFFFULL)
+#define GET_DOMID(x)       (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x)       (((x)->conf >> 40) & 0xFFFFFULL)
 
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
-#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x)  ((x)->conf1  & 0xFFFFULL)
+#define GET_DOMID_MASK(x)  (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x)  (((x)->conf1 >> 32) & 0xFFFFFULL)
 
-static struct perf_amd_iommu __perf_iommu;
+#define IOMMU_NAME_SIZE 16
 
 struct perf_amd_iommu {
+	struct list_head list;
 	struct pmu pmu;
+	struct amd_iommu *iommu;
+	char name[IOMMU_NAME_SIZE];
 	u8 max_banks;
 	u8 max_counters;
 	u64 cntr_assign_mask;
 	raw_spinlock_t lock;
-	const struct attribute_group *attr_groups[4];
 };
 
-#define format_group	attr_groups[0]
-#define cpumask_group	attr_groups[1]
-#define events_group	attr_groups[2]
-#define null_group	attr_groups[3]
+static LIST_HEAD(perf_amd_iommu_list);
 
 /*---------------------------------------------
  * sysfs format attributes
  *---------------------------------------------*/
 PMU_FORMAT_ATTR(csource,    "config:0-7");
 PMU_FORMAT_ATTR(devid,      "config:8-23");
-PMU_FORMAT_ATTR(pasid,      "config:24-39");
-PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(domid,      "config:24-39");
+PMU_FORMAT_ATTR(pasid,      "config:40-59");
 PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
 
 static struct attribute *iommu_format_attrs[] = {
 	&format_attr_csource.attr,
@@ -79,6 +79,10 @@ static struct attribute_group amd_iommu_format_group = {
 /*---------------------------------------------
  * sysfs events attributes
  *---------------------------------------------*/
+static struct attribute_group amd_iommu_events_group = {
+	.name = "events",
+};
+
 struct amd_iommu_event_desc {
 	struct kobj_attribute attr;
 	const char *event;
@@ -150,30 +154,34 @@ static struct attribute_group amd_iommu_cpumask_group = {
 
 /*---------------------------------------------*/
 
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
 {
+	struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+	int max_cntrs = piommu->max_counters;
+	int max_banks = piommu->max_banks;
+	u32 shift, bank, cntr;
 	unsigned long flags;
-	int shift, bank, cntr, retval;
-	int max_banks = perf_iommu->max_banks;
-	int max_cntrs = perf_iommu->max_counters;
+	int retval;
 
-	raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+	raw_spin_lock_irqsave(&piommu->lock, flags);
 
 	for (bank = 0, shift = 0; bank < max_banks; bank++) {
 		for (cntr = 0; cntr < max_cntrs; cntr++) {
 			shift = bank + (bank*3) + cntr;
-			if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+			if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
 				continue;
 			} else {
-				perf_iommu->cntr_assign_mask |= (1ULL<<shift);
-				retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+				piommu->cntr_assign_mask |= BIT_ULL(shift);
+				event->hw.iommu_bank = bank;
+				event->hw.iommu_cntr = cntr;
+				retval = 0;
 				goto out;
 			}
 		}
 	}
 	retval = -ENOSPC;
 out:
-	raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+	raw_spin_unlock_irqrestore(&piommu->lock, flags);
 	return retval;
 }
 
@@ -202,8 +210,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
 static int perf_iommu_event_init(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	struct perf_amd_iommu *perf_iommu;
-	u64 config, config1;
 
 	/* test the event attr type check for PMU enumeration */
 	if (event->attr.type != event->pmu->type)
@@ -225,80 +231,62 @@ static int perf_iommu_event_init(struct perf_event *event)
 	if (event->cpu < 0)
 		return -EINVAL;
 
-	perf_iommu = &__perf_iommu;
-
-	if (event->pmu != &perf_iommu->pmu)
-		return -ENOENT;
-
-	if (perf_iommu) {
-		config = event->attr.config;
-		config1 = event->attr.config1;
-	} else {
-		return -EINVAL;
-	}
-
-	/* integrate with iommu base devid (0000), assume one iommu */
-	perf_iommu->max_banks =
-		amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
-	perf_iommu->max_counters =
-		amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
-	if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
-		return -EINVAL;
-
 	/* update the hw_perf_event struct with the iommu config data */
-	hwc->config = config;
-	hwc->extra_reg.config = config1;
+	hwc->conf  = event->attr.config;
+	hwc->conf1 = event->attr.config1;
 
 	return 0;
 }
 
+static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev)
+{
+	return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu;
+}
+
 static void perf_iommu_enable_event(struct perf_event *ev)
 {
-	u8 csource = _GET_CSOURCE(ev);
-	u16 devid = _GET_DEVID(ev);
+	struct amd_iommu *iommu = perf_event_2_iommu(ev);
+	struct hw_perf_event *hwc = &ev->hw;
+	u8 bank = hwc->iommu_bank;
+	u8 cntr = hwc->iommu_cntr;
 	u64 reg = 0ULL;
 
-	reg = csource;
-	amd_iommu_pc_get_set_reg_val(devid,
-			_GET_BANK(ev), _GET_CNTR(ev) ,
-			 IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+	reg = GET_CSOURCE(hwc);
+	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
 
-	reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+	reg = GET_DEVID_MASK(hwc);
+	reg = GET_DEVID(hwc) | (reg << 32);
 	if (reg)
-		reg |= (1UL << 31);
-	amd_iommu_pc_get_set_reg_val(devid,
-			_GET_BANK(ev), _GET_CNTR(ev) ,
-			 IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+		reg |= BIT(31);
+	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
 
-	reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+	reg = GET_PASID_MASK(hwc);
+	reg = GET_PASID(hwc) | (reg << 32);
 	if (reg)
-		reg |= (1UL << 31);
-	amd_iommu_pc_get_set_reg_val(devid,
-			_GET_BANK(ev), _GET_CNTR(ev) ,
-			 IOMMU_PC_PASID_MATCH_REG, &reg, true);
+		reg |= BIT(31);
+	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
 
-	reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+	reg = GET_DOMID_MASK(hwc);
+	reg = GET_DOMID(hwc) | (reg << 32);
 	if (reg)
-		reg |= (1UL << 31);
-	amd_iommu_pc_get_set_reg_val(devid,
-			_GET_BANK(ev), _GET_CNTR(ev) ,
-			 IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+		reg |= BIT(31);
+	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
 }
 
 static void perf_iommu_disable_event(struct perf_event *event)
 {
+	struct amd_iommu *iommu = perf_event_2_iommu(event);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 reg = 0ULL;
 
-	amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-			_GET_BANK(event), _GET_CNTR(event),
-			IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+	amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+			     IOMMU_PC_COUNTER_SRC_REG, &reg);
 }
 
 static void perf_iommu_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	pr_debug("perf: amd_iommu:perf_iommu_start\n");
 	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
 		return;
 
@@ -306,10 +294,11 @@ static void perf_iommu_start(struct perf_event *event, int flags)
 	hwc->state = 0;
 
 	if (flags & PERF_EF_RELOAD) {
-		u64 prev_raw_count =  local64_read(&hwc->prev_count);
-		amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-				_GET_BANK(event), _GET_CNTR(event),
-				IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+		u64 prev_raw_count = local64_read(&hwc->prev_count);
+		struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+		amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+				     IOMMU_PC_COUNTER_REG, &prev_raw_count);
 	}
 
 	perf_iommu_enable_event(event);
@@ -319,37 +308,30 @@ static void perf_iommu_start(struct perf_event *event, int flags)
 
 static void perf_iommu_read(struct perf_event *event)
 {
-	u64 count = 0ULL;
-	u64 prev_raw_count = 0ULL;
-	u64 delta = 0ULL;
+	u64 count, prev, delta;
 	struct hw_perf_event *hwc = &event->hw;
-	pr_debug("perf: amd_iommu:perf_iommu_read\n");
+	struct amd_iommu *iommu = perf_event_2_iommu(event);
 
-	amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-				_GET_BANK(event), _GET_CNTR(event),
-				IOMMU_PC_COUNTER_REG, &count, false);
+	if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+				 IOMMU_PC_COUNTER_REG, &count))
+		return;
 
 	/* IOMMU pc counter register is only 48 bits */
-	count &= 0xFFFFFFFFFFFFULL;
+	count &= GENMASK_ULL(47, 0);
 
-	prev_raw_count =  local64_read(&hwc->prev_count);
-	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-					count) != prev_raw_count)
+	prev = local64_read(&hwc->prev_count);
+	if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
 		return;
 
-	/* Handling 48-bit counter overflowing */
-	delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+	/* Handle 48-bit counter overflow */
+	delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
 	delta >>= COUNTER_SHIFT;
 	local64_add(delta, &event->count);
-
 }
 
 static void perf_iommu_stop(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
-
-	pr_debug("perf: amd_iommu:perf_iommu_stop\n");
 
 	if (hwc->state & PERF_HES_UPTODATE)
 		return;
@@ -361,7 +343,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
 	if (hwc->state & PERF_HES_UPTODATE)
 		return;
 
-	config = hwc->config;
 	perf_iommu_read(event);
 	hwc->state |= PERF_HES_UPTODATE;
 }
@@ -369,17 +350,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
 static int perf_iommu_add(struct perf_event *event, int flags)
 {
 	int retval;
-	struct perf_amd_iommu *perf_iommu =
-			container_of(event->pmu, struct perf_amd_iommu, pmu);
 
-	pr_debug("perf: amd_iommu:perf_iommu_add\n");
 	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
 	/* request an iommu bank/counter */
-	retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
-	if (retval != -ENOSPC)
-		event->hw.extra_reg.reg = (u16)retval;
-	else
+	retval = get_next_avail_iommu_bnk_cntr(event);
+	if (retval)
 		return retval;
 
 	if (flags & PERF_EF_START)
@@ -390,115 +366,124 @@ static int perf_iommu_add(struct perf_event *event, int flags)
 
 static void perf_iommu_del(struct perf_event *event, int flags)
 {
+	struct hw_perf_event *hwc = &event->hw;
 	struct perf_amd_iommu *perf_iommu =
 			container_of(event->pmu, struct perf_amd_iommu, pmu);
 
-	pr_debug("perf: amd_iommu:perf_iommu_del\n");
 	perf_iommu_stop(event, PERF_EF_UPDATE);
 
 	/* clear the assigned iommu bank/counter */
 	clear_avail_iommu_bnk_cntr(perf_iommu,
-				     _GET_BANK(event),
-				     _GET_CNTR(event));
+				   hwc->iommu_bank, hwc->iommu_cntr);
 
 	perf_event_update_userpage(event);
 }
 
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+static __init int _init_events_attrs(void)
 {
-	struct attribute **attrs;
-	struct attribute_group *attr_group;
 	int i = 0, j;
+	struct attribute **attrs;
 
 	while (amd_iommu_v2_event_descs[i].attr.attr.name)
 		i++;
 
-	attr_group = kzalloc(sizeof(struct attribute *)
-		* (i + 1) + sizeof(*attr_group), GFP_KERNEL);
-	if (!attr_group)
+	attrs = kzalloc(sizeof(struct attribute **) * (i + 1), GFP_KERNEL);
+	if (!attrs)
 		return -ENOMEM;
 
-	attrs = (struct attribute **)(attr_group + 1);
 	for (j = 0; j < i; j++)
 		attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
 
-	attr_group->name = "events";
-	attr_group->attrs = attrs;
-	perf_iommu->events_group = attr_group;
-
+	amd_iommu_events_group.attrs = attrs;
 	return 0;
 }
 
-static __init void amd_iommu_pc_exit(void)
-{
-	if (__perf_iommu.events_group != NULL) {
-		kfree(__perf_iommu.events_group);
-		__perf_iommu.events_group = NULL;
-	}
-}
+const struct attribute_group *amd_iommu_attr_groups[] = {
+	&amd_iommu_format_group,
+	&amd_iommu_cpumask_group,
+	&amd_iommu_events_group,
+	NULL,
+};
+
+static struct pmu iommu_pmu = {
+	.event_init	= perf_iommu_event_init,
+	.add		= perf_iommu_add,
+	.del		= perf_iommu_del,
+	.start		= perf_iommu_start,
+	.stop		= perf_iommu_stop,
+	.read		= perf_iommu_read,
+	.task_ctx_nr	= perf_invalid_context,
+	.attr_groups	= amd_iommu_attr_groups,
+};
 
-static __init int _init_perf_amd_iommu(
-	struct perf_amd_iommu *perf_iommu, char *name)
+static __init int init_one_iommu(unsigned int idx)
 {
+	struct perf_amd_iommu *perf_iommu;
 	int ret;
 
+	perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL);
+	if (!perf_iommu)
+		return -ENOMEM;
+
 	raw_spin_lock_init(&perf_iommu->lock);
 
-	/* Init format attributes */
-	perf_iommu->format_group = &amd_iommu_format_group;
+	perf_iommu->pmu          = iommu_pmu;
+	perf_iommu->iommu        = get_amd_iommu(idx);
+	perf_iommu->max_banks    = amd_iommu_pc_get_max_banks(idx);
+	perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx);
 
-	/* Init cpumask attributes to only core 0 */
-	cpumask_set_cpu(0, &iommu_cpumask);
-	perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
-	/* Init events attributes */
-	if (_init_events_attrs(perf_iommu) != 0)
-		pr_err("perf: amd_iommu: Only support raw events.\n");
+	if (!perf_iommu->iommu ||
+	    !perf_iommu->max_banks ||
+	    !perf_iommu->max_counters) {
+		kfree(perf_iommu);
+		return -EINVAL;
+	}
 
-	/* Init null attributes */
-	perf_iommu->null_group = NULL;
-	perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+	snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx);
 
-	ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
-	if (ret) {
-		pr_err("perf: amd_iommu: Failed to initialized.\n");
-		amd_iommu_pc_exit();
+	ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1);
+	if (!ret) {
+		pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n",
+			idx, perf_iommu->max_banks, perf_iommu->max_counters);
+		list_add_tail(&perf_iommu->list, &perf_amd_iommu_list);
 	} else {
-		pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
-			amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
-			amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+		pr_warn("Error initializing IOMMU %d.\n", idx);
+		kfree(perf_iommu);
 	}
-
 	return ret;
 }
 
-static struct perf_amd_iommu __perf_iommu = {
-	.pmu = {
-		.task_ctx_nr    = perf_invalid_context,
-		.event_init	= perf_iommu_event_init,
-		.add		= perf_iommu_add,
-		.del		= perf_iommu_del,
-		.start		= perf_iommu_start,
-		.stop		= perf_iommu_stop,
-		.read		= perf_iommu_read,
-	},
-	.max_banks		= 0x00,
-	.max_counters		= 0x00,
-	.cntr_assign_mask	= 0ULL,
-	.format_group		= NULL,
-	.cpumask_group		= NULL,
-	.events_group		= NULL,
-	.null_group		= NULL,
-};
-
 static __init int amd_iommu_pc_init(void)
 {
+	unsigned int i, cnt = 0;
+	int ret;
+
 	/* Make sure the IOMMU PC resource is available */
 	if (!amd_iommu_pc_supported())
 		return -ENODEV;
 
-	_init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+	ret = _init_events_attrs();
+	if (ret)
+		return ret;
+
+	/*
+	 * An IOMMU PMU is specific to an IOMMU, and can function independently.
+	 * So we go through all IOMMUs and ignore the one that fails init
+	 * unless all IOMMU are failing.
+	 */
+	for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
+		ret = init_one_iommu(i);
+		if (!ret)
+			cnt++;
+	}
+
+	if (!cnt) {
+		kfree(amd_iommu_events_group.attrs);
+		return -ENODEV;
+	}
 
+	/* Init cpumask attributes to only core 0 */
+	cpumask_set_cpu(0, &iommu_cpumask);
 	return 0;
 }
 
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
index 845d173278e3..62e0702c4374 100644
--- a/arch/x86/events/amd/iommu.h
+++ b/arch/x86/events/amd/iommu.h
@@ -24,17 +24,23 @@
 #define PC_MAX_SPEC_BNKS			64
 #define PC_MAX_SPEC_CNTRS			16
 
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID			0x0000
+struct amd_iommu;
 
 /* amd_iommu_init.c external support functions */
+extern int amd_iommu_get_num_iommus(void);
+
 extern bool amd_iommu_pc_supported(void);
 
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+
+extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+
+extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+				u8 fxn, u64 *value);
 
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+				u8 fxn, u64 *value);
 
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
-			u8 fxn, u64 *value, bool is_write);
+extern struct amd_iommu *get_amd_iommu(int idx);
 
 #endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 4d1f7f2d9aff..ad44af0dd667 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -30,6 +30,9 @@
 
 #define COUNTER_SHIFT		16
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"amd_uncore: " fmt
+
 static int num_counters_llc;
 static int num_counters_nb;
 
@@ -509,51 +512,34 @@ static int __init amd_uncore_init(void)
 	int ret = -ENODEV;
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-		goto fail_nodev;
-
-	switch(boot_cpu_data.x86) {
-		case 23:
-			/* Family 17h: */
-			num_counters_nb = NUM_COUNTERS_NB;
-			num_counters_llc = NUM_COUNTERS_L3;
-			/*
-			 * For Family17h, the NorthBridge counters are
-			 * re-purposed as Data Fabric counters. Also, support is
-			 * added for L3 counters. The pmus are exported based on
-			 * family as either L2 or L3 and NB or DF.
-			 */
-			amd_nb_pmu.name = "amd_df";
-			amd_llc_pmu.name = "amd_l3";
-			format_attr_event_df.show = &event_show_df;
-			format_attr_event_l3.show = &event_show_l3;
-			break;
-		case 22:
-			/* Family 16h - may change: */
-			num_counters_nb = NUM_COUNTERS_NB;
-			num_counters_llc = NUM_COUNTERS_L2;
-			amd_nb_pmu.name = "amd_nb";
-			amd_llc_pmu.name = "amd_l2";
-			format_attr_event_df = format_attr_event;
-			format_attr_event_l3 = format_attr_event;
-			break;
-		default:
-			/*
-			 * All prior families have the same number of
-			 * NorthBridge and Last Level Cache counters
-			 */
-			num_counters_nb = NUM_COUNTERS_NB;
-			num_counters_llc = NUM_COUNTERS_L2;
-			amd_nb_pmu.name = "amd_nb";
-			amd_llc_pmu.name = "amd_l2";
-			format_attr_event_df = format_attr_event;
-			format_attr_event_l3 = format_attr_event;
-			break;
-	}
-	amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
-	amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
+		return -ENODEV;
 
 	if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
-		goto fail_nodev;
+		return -ENODEV;
+
+	if (boot_cpu_data.x86 == 0x17) {
+		/*
+		 * For F17h, the Northbridge counters are repurposed as Data
+		 * Fabric counters. Also, L3 counters are supported too. The PMUs
+		 * are exported based on  family as either L2 or L3 and NB or DF.
+		 */
+		num_counters_nb		  = NUM_COUNTERS_NB;
+		num_counters_llc	  = NUM_COUNTERS_L3;
+		amd_nb_pmu.name		  = "amd_df";
+		amd_llc_pmu.name	  = "amd_l3";
+		format_attr_event_df.show = &event_show_df;
+		format_attr_event_l3.show = &event_show_l3;
+	} else {
+		num_counters_nb		  = NUM_COUNTERS_NB;
+		num_counters_llc	  = NUM_COUNTERS_L2;
+		amd_nb_pmu.name		  = "amd_nb";
+		amd_llc_pmu.name	  = "amd_l2";
+		format_attr_event_df	  = format_attr_event;
+		format_attr_event_l3	  = format_attr_event;
+	}
+
+	amd_nb_pmu.attr_groups	= amd_uncore_attr_groups_df;
+	amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
 
 	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
 		amd_uncore_nb = alloc_percpu(struct amd_uncore *);
@@ -565,7 +551,7 @@ static int __init amd_uncore_init(void)
 		if (ret)
 			goto fail_nb;
 
-		pr_info("perf: AMD NB counters detected\n");
+		pr_info("AMD NB counters detected\n");
 		ret = 0;
 	}
 
@@ -579,7 +565,7 @@ static int __init amd_uncore_init(void)
 		if (ret)
 			goto fail_llc;
 
-		pr_info("perf: AMD LLC counters detected\n");
+		pr_info("AMD LLC counters detected\n");
 		ret = 0;
 	}
 
@@ -615,7 +601,6 @@ fail_nb:
 	if (amd_uncore_nb)
 		free_percpu(amd_uncore_nb);
 
-fail_nodev:
 	return ret;
 }
 device_initcall(amd_uncore_init);
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 982c9e31daca..8ae8c5ce3a1f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -63,7 +63,6 @@ struct bts_buffer {
 	unsigned int	cur_buf;
 	bool		snapshot;
 	local_t		data_size;
-	local_t		lost;
 	local_t		head;
 	unsigned long	end;
 	void		**data_pages;
@@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts)
 			return;
 
 		if (ds->bts_index >= ds->bts_absolute_maximum)
-			local_inc(&buf->lost);
+			perf_aux_output_flag(&bts->handle,
+			                     PERF_AUX_FLAG_TRUNCATED);
 
 		/*
 		 * old and head are always in the same physical buffer, so we
@@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags)
 	return;
 
 fail_end_stop:
-	perf_aux_output_end(&bts->handle, 0, false);
+	perf_aux_output_end(&bts->handle, 0);
 
 fail_stop:
 	event->hw.state = PERF_HES_STOPPED;
@@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags)
 				bts->handle.head =
 					local_xchg(&buf->data_size,
 						   buf->nr_pages << PAGE_SHIFT);
-
-			perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-					    !!local_xchg(&buf->lost, 0));
+			perf_aux_output_end(&bts->handle,
+			                    local_xchg(&buf->data_size, 0));
 		}
 
 		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
@@ -484,8 +483,7 @@ int intel_bts_interrupt(void)
 	if (old_head == local_read(&buf->head))
 		return handled;
 
-	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-			    !!local_xchg(&buf->lost, 0));
+	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));
 
 	buf = perf_aux_output_begin(&bts->handle, event);
 	if (buf)
@@ -500,7 +498,7 @@ int intel_bts_interrupt(void)
 			 * cleared handle::event
 			 */
 			barrier();
-			perf_aux_output_end(&bts->handle, 0, false);
+			perf_aux_output_end(&bts->handle, 0);
 		}
 	}
 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index eb1484c86bb4..a6d91d4e37a1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1553,6 +1553,27 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
+/* UOPS_NOT_DELIVERED.ANY */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
+/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
+EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
+/* UOPS_RETIRED.ANY */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
+/* UOPS_ISSUED.ANY */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
+
+static struct attribute *glm_events_attrs[] = {
+	EVENT_PTR(td_total_slots_glm),
+	EVENT_PTR(td_total_slots_scale_glm),
+	EVENT_PTR(td_fetch_bubbles_glm),
+	EVENT_PTR(td_recovery_bubbles_glm),
+	EVENT_PTR(td_slots_issued_glm),
+	EVENT_PTR(td_slots_retired_glm),
+	NULL
+};
+
 static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
@@ -2130,7 +2151,7 @@ again:
 	 * counters from the GLOBAL_STATUS mask and we always process PEBS
 	 * events via drain_pebs().
 	 */
-	status &= ~cpuc->pebs_enabled;
+	status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
 
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
@@ -3750,6 +3771,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_prec_dist = true;
 		x86_pmu.lbr_pt_coexist = true;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.cpu_events = glm_events_attrs;
 		pr_cont("Goldmont events, ");
 		break;
 
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9dfeeeca0ea8..c6d23ffe422d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1222,7 +1222,7 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 
 			/* clear non-PEBS bit and re-check */
 			pebs_status = p->status & cpuc->pebs_enabled;
-			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+			pebs_status &= PEBS_COUNTER_MASK;
 			if (pebs_status == (1 << bit))
 				return at;
 		}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 5900471ee508..ae8324d65e61 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -28,6 +28,7 @@
 #include <asm/insn.h>
 #include <asm/io.h>
 #include <asm/intel_pt.h>
+#include <asm/intel-family.h>
 
 #include "../perf_event.h"
 #include "pt.h"
@@ -98,6 +99,7 @@ static struct attribute_group pt_cap_group = {
 	.name	= "caps",
 };
 
+PMU_FORMAT_ATTR(pt,		"config:0"	);
 PMU_FORMAT_ATTR(cyc,		"config:1"	);
 PMU_FORMAT_ATTR(pwr_evt,	"config:4"	);
 PMU_FORMAT_ATTR(fup_on_ptw,	"config:5"	);
@@ -105,11 +107,13 @@ PMU_FORMAT_ATTR(mtc,		"config:9"	);
 PMU_FORMAT_ATTR(tsc,		"config:10"	);
 PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
 PMU_FORMAT_ATTR(ptw,		"config:12"	);
+PMU_FORMAT_ATTR(branch,		"config:13"	);
 PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
 PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
 PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);
 
 static struct attribute *pt_formats_attr[] = {
+	&format_attr_pt.attr,
 	&format_attr_cyc.attr,
 	&format_attr_pwr_evt.attr,
 	&format_attr_fup_on_ptw.attr,
@@ -117,6 +121,7 @@ static struct attribute *pt_formats_attr[] = {
 	&format_attr_tsc.attr,
 	&format_attr_noretcomp.attr,
 	&format_attr_ptw.attr,
+	&format_attr_branch.attr,
 	&format_attr_mtc_period.attr,
 	&format_attr_cyc_thresh.attr,
 	&format_attr_psb_period.attr,
@@ -197,6 +202,19 @@ static int __init pt_pmu_hw_init(void)
 		pt_pmu.tsc_art_den = eax;
 	}
 
+	/* model-specific quirks */
+	switch (boot_cpu_data.x86_model) {
+	case INTEL_FAM6_BROADWELL_CORE:
+	case INTEL_FAM6_BROADWELL_XEON_D:
+	case INTEL_FAM6_BROADWELL_GT3E:
+	case INTEL_FAM6_BROADWELL_X:
+		/* not setting BRANCH_EN will #GP, erratum BDM106 */
+		pt_pmu.branch_en_always_on = true;
+		break;
+	default:
+		break;
+	}
+
 	if (boot_cpu_has(X86_FEATURE_VMX)) {
 		/*
 		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
@@ -263,8 +281,20 @@ fail:
 #define RTIT_CTL_PTW	(RTIT_CTL_PTW_EN	| \
 			 RTIT_CTL_FUP_ON_PTW)
 
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN		| \
+/*
+ * Bit 0 (TraceEn) in the attr.config is meaningless as the
+ * corresponding bit in the RTIT_CTL can only be controlled
+ * by the driver; therefore, repurpose it to mean: pass
+ * through the bit that was previously assumed to be always
+ * on for PT, thereby allowing the user to *not* set it if
+ * they so wish. See also pt_event_valid() and pt_config().
+ */
+#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
+
+#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN	| \
+			RTIT_CTL_TSC_EN		| \
 			RTIT_CTL_DISRETC	| \
+			RTIT_CTL_BRANCH_EN	| \
 			RTIT_CTL_CYC_PSB	| \
 			RTIT_CTL_MTC		| \
 			RTIT_CTL_PWR_EVT_EN	| \
@@ -332,6 +362,33 @@ static bool pt_event_valid(struct perf_event *event)
 			return false;
 	}
 
+	/*
+	 * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
+	 * clears the assomption that BranchEn must always be enabled,
+	 * as was the case with the first implementation of PT.
+	 * If this bit is not set, the legacy behavior is preserved
+	 * for compatibility with the older userspace.
+	 *
+	 * Re-using bit 0 for this purpose is fine because it is never
+	 * directly set by the user; previous attempts at setting it in
+	 * the attr.config resulted in -EINVAL.
+	 */
+	if (config & RTIT_CTL_PASSTHROUGH) {
+		/*
+		 * Disallow not setting BRANCH_EN where BRANCH_EN is
+		 * always required.
+		 */
+		if (pt_pmu.branch_en_always_on &&
+		    !(config & RTIT_CTL_BRANCH_EN))
+			return false;
+	} else {
+		/*
+		 * Disallow BRANCH_EN without the PASSTHROUGH.
+		 */
+		if (config & RTIT_CTL_BRANCH_EN)
+			return false;
+	}
+
 	return true;
 }
 
@@ -411,6 +468,7 @@ static u64 pt_config_filters(struct perf_event *event)
 
 static void pt_config(struct perf_event *event)
 {
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	u64 reg;
 
 	if (!event->hw.itrace_started) {
@@ -419,7 +477,20 @@ static void pt_config(struct perf_event *event)
 	}
 
 	reg = pt_config_filters(event);
-	reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+	reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+
+	/*
+	 * Previously, we had BRANCH_EN on by default, but now that PT has
+	 * grown features outside of branch tracing, it is useful to allow
+	 * the user to disable it. Setting bit 0 in the event's attr.config
+	 * allows BRANCH_EN to pass through instead of being always on. See
+	 * also the comment in pt_event_valid().
+	 */
+	if (event->attr.config & BIT(0)) {
+		reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
+	} else {
+		reg |= RTIT_CTL_BRANCH_EN;
+	}
 
 	if (!event->attr.exclude_kernel)
 		reg |= RTIT_CTL_OS;
@@ -429,11 +500,15 @@ static void pt_config(struct perf_event *event)
 	reg |= (event->attr.config & PT_CONFIG_MASK);
 
 	event->hw.config = reg;
-	wrmsrl(MSR_IA32_RTIT_CTL, reg);
+	if (READ_ONCE(pt->vmx_on))
+		perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+	else
+		wrmsrl(MSR_IA32_RTIT_CTL, reg);
 }
 
 static void pt_config_stop(struct perf_event *event)
 {
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	u64 ctl = READ_ONCE(event->hw.config);
 
 	/* may be already stopped by a PMI */
@@ -441,7 +516,8 @@ static void pt_config_stop(struct perf_event *event)
 		return;
 
 	ctl &= ~RTIT_CTL_TRACEEN;
-	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+	if (!READ_ONCE(pt->vmx_on))
+		wrmsrl(MSR_IA32_RTIT_CTL, ctl);
 
 	WRITE_ONCE(event->hw.config, ctl);
 
@@ -753,7 +829,8 @@ static void pt_handle_status(struct pt *pt)
 		 */
 		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
 		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
-			local_inc(&buf->lost);
+			perf_aux_output_flag(&pt->handle,
+			                     PERF_AUX_FLAG_TRUNCATED);
 			advance++;
 		}
 	}
@@ -846,8 +923,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 
 	/* can't stop in the middle of an output region */
 	if (buf->output_off + handle->size + 1 <
-	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 		return -EINVAL;
+	}
 
 
 	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
@@ -1171,12 +1250,6 @@ void intel_pt_interrupt(void)
 	if (!READ_ONCE(pt->handle_nmi))
 		return;
 
-	/*
-	 * If VMX is on and PT does not support it, don't touch anything.
-	 */
-	if (READ_ONCE(pt->vmx_on))
-		return;
-
 	if (!event)
 		return;
 
@@ -1192,8 +1265,7 @@ void intel_pt_interrupt(void)
 
 	pt_update_head(pt);
 
-	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-			    local_xchg(&buf->lost, 0));
+	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
 
 	if (!event->hw.state) {
 		int ret;
@@ -1208,7 +1280,7 @@ void intel_pt_interrupt(void)
 		/* snapshot counters don't use PMI, so it's safe */
 		ret = pt_buffer_reset_markers(buf, &pt->handle);
 		if (ret) {
-			perf_aux_output_end(&pt->handle, 0, true);
+			perf_aux_output_end(&pt->handle, 0);
 			return;
 		}
 
@@ -1237,12 +1309,19 @@ void intel_pt_handle_vmx(int on)
 	local_irq_save(flags);
 	WRITE_ONCE(pt->vmx_on, on);
 
-	if (on) {
-		/* prevent pt_config_stop() from writing RTIT_CTL */
-		event = pt->handle.event;
-		if (event)
-			event->hw.config = 0;
-	}
+	/*
+	 * If an AUX transaction is in progress, it will contain
+	 * gap(s), so flag it PARTIAL to inform the user.
+	 */
+	event = pt->handle.event;
+	if (event)
+		perf_aux_output_flag(&pt->handle,
+		                     PERF_AUX_FLAG_PARTIAL);
+
+	/* Turn PTs back on */
+	if (!on && event)
+		wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
@@ -1257,9 +1336,6 @@ static void pt_event_start(struct perf_event *event, int mode)
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct pt_buffer *buf;
 
-	if (READ_ONCE(pt->vmx_on))
-		return;
-
 	buf = perf_aux_output_begin(&pt->handle, event);
 	if (!buf)
 		goto fail_stop;
@@ -1280,7 +1356,7 @@ static void pt_event_start(struct perf_event *event, int mode)
 	return;
 
 fail_end_stop:
-	perf_aux_output_end(&pt->handle, 0, true);
+	perf_aux_output_end(&pt->handle, 0);
 fail_stop:
 	hwc->state = PERF_HES_STOPPED;
 }
@@ -1321,8 +1397,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
 			pt->handle.head =
 				local_xchg(&buf->data_size,
 					   buf->nr_pages << PAGE_SHIFT);
-		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-				    local_xchg(&buf->lost, 0));
+		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
 	}
 }
 
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 53473c21b554..0eb41d07b79a 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -110,6 +110,7 @@ struct pt_pmu {
 	struct pmu		pmu;
 	u32			caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
 	bool			vmx;
+	bool			branch_en_always_on;
 	unsigned long		max_nonturbo_ratio;
 	unsigned int		tsc_art_num;
 	unsigned int		tsc_art_den;
@@ -143,7 +144,6 @@ struct pt_buffer {
 	size_t			output_off;
 	unsigned long		nr_pages;
 	local_t			data_size;
-	local_t			lost;
 	local64_t		head;
 	bool			snapshot;
 	unsigned long		stop_pos, intr_pos;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index bcbb1d2ae10b..be3d36254040 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -79,6 +79,7 @@ struct amd_nb {
 
 /* The maximal number of PEBS events: */
 #define MAX_PEBS_EVENTS		8
+#define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 /*
  * Flags PEBS can handle without an PMI.
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 200581691c6e..34b984c60790 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -72,14 +72,13 @@ struct arch_specific_insn {
 	/* copy of the original instruction */
 	kprobe_opcode_t *insn;
 	/*
-	 * boostable = -1: This instruction type is not boostable.
-	 * boostable = 0: This instruction type is boostable.
-	 * boostable = 1: This instruction has been boosted: we have
+	 * boostable = false: This instruction type is not boostable.
+	 * boostable = true: This instruction has been boosted: we have
 	 * added a relative jump after the instruction copy in insn,
 	 * so no single-step and fixup are needed (unless there's
 	 * a post_handler or break_handler).
 	 */
-	int boostable;
+	bool boostable;
 	bool if_modifier;
 };
 
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index d688826e5736..db2182d63ed0 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -67,7 +67,7 @@
 #endif
 
 /* Ensure if the instruction can be boostable */
-extern int can_boost(kprobe_opcode_t *instruction, void *addr);
+extern int can_boost(struct insn *insn, void *orig_addr);
 /* Recover instruction if given address is probed */
 extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
 					 unsigned long addr);
@@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
  * Copy an instruction and adjust the displacement if the instruction
  * uses the %rip-relative addressing mode.
  */
-extern int __copy_instruction(u8 *dest, u8 *src);
+extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn);
 
 /* Generate a relative-jump/call instruction */
 extern void synthesize_reljump(void *from, void *to);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 993fa4fe4f68..19e1f2a6d7b0 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -164,42 +164,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
 NOKPROBE_SYMBOL(skip_prefixes);
 
 /*
- * Returns non-zero if opcode is boostable.
+ * Returns non-zero if INSN is boostable.
  * RIP relative instructions are adjusted at copying time in 64 bits mode
  */
-int can_boost(kprobe_opcode_t *opcodes, void *addr)
+int can_boost(struct insn *insn, void *addr)
 {
 	kprobe_opcode_t opcode;
-	kprobe_opcode_t *orig_opcodes = opcodes;
 
 	if (search_exception_tables((unsigned long)addr))
 		return 0;	/* Page fault may occur on this address. */
 
-retry:
-	if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-		return 0;
-	opcode = *(opcodes++);
-
 	/* 2nd-byte opcode */
-	if (opcode == 0x0f) {
-		if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-			return 0;
-		return test_bit(*opcodes,
+	if (insn->opcode.nbytes == 2)
+		return test_bit(insn->opcode.bytes[1],
 				(unsigned long *)twobyte_is_boostable);
-	}
+
+	if (insn->opcode.nbytes != 1)
+		return 0;
+
+	/* Can't boost Address-size override prefix */
+	if (unlikely(inat_is_address_size_prefix(insn->attr)))
+		return 0;
+
+	opcode = insn->opcode.bytes[0];
 
 	switch (opcode & 0xf0) {
-#ifdef CONFIG_X86_64
-	case 0x40:
-		goto retry; /* REX prefix is boostable */
-#endif
 	case 0x60:
-		if (0x63 < opcode && opcode < 0x67)
-			goto retry; /* prefixes */
-		/* can't boost Address-size override and bound */
-		return (opcode != 0x62 && opcode != 0x67);
+		/* can't boost "bound" */
+		return (opcode != 0x62);
 	case 0x70:
 		return 0; /* can't boost conditional jump */
+	case 0x90:
+		return opcode != 0x9a;	/* can't boost call far */
 	case 0xc0:
 		/* can't boost software-interruptions */
 		return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
@@ -210,14 +206,9 @@ retry:
 		/* can boost in/out and absolute jmps */
 		return ((opcode & 0x04) || opcode == 0xea);
 	case 0xf0:
-		if ((opcode & 0x0c) == 0 && opcode != 0xf1)
-			goto retry; /* lock/rep(ne) prefix */
 		/* clear and set flags are boostable */
 		return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
 	default:
-		/* segment override prefixes are boostable */
-		if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
-			goto retry; /* prefixes */
 		/* CS override prefix and call are not boostable */
 		return (opcode != 0x2e && opcode != 0x9a);
 	}
@@ -264,7 +255,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 	 * Fortunately, we know that the original code is the ideal 5-byte
 	 * long NOP.
 	 */
-	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (probe_kernel_read(buf, (void *)addr,
+		MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+		return 0UL;
+
 	if (faddr)
 		memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
 	else
@@ -276,7 +270,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
  * Recover the probed instruction at addr for further analysis.
  * Caller must lock kprobes by kprobe_mutex, or disable preemption
  * for preventing to release referencing kprobes.
- * Returns zero if the instruction can not get recovered.
+ * Returns zero if the instruction can not get recovered (or access failed).
  */
 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 {
@@ -348,37 +342,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn)
 }
 
 /*
- * Copy an instruction and adjust the displacement if the instruction
- * uses the %rip-relative addressing mode.
- * If it does, Return the address of the 32-bit displacement word.
- * If not, return null.
- * Only applicable to 64-bit x86.
+ * Copy an instruction with recovering modified instruction by kprobes
+ * and adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode.
+ * This returns the length of copied instruction, or 0 if it has an error.
  */
-int __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
 {
-	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
-	int length;
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
-	if (!recovered_insn)
+	if (!recovered_insn || !insn)
+		return 0;
+
+	/* This can access kernel text if given address is not recovered */
+	if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
 		return 0;
-	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
-	insn_get_length(&insn);
-	length = insn.length;
+
+	kernel_insn_init(insn, dest, MAX_INSN_SIZE);
+	insn_get_length(insn);
 
 	/* Another subsystem puts a breakpoint, failed to recover */
-	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+	if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
-	memcpy(dest, insn.kaddr, length);
 
 #ifdef CONFIG_X86_64
-	if (insn_rip_relative(&insn)) {
+	/* Only x86_64 has RIP relative instructions */
+	if (insn_rip_relative(insn)) {
 		s64 newdisp;
 		u8 *disp;
-		kernel_insn_init(&insn, dest, length);
-		insn_get_displacement(&insn);
 		/*
 		 * The copied instruction uses the %rip-relative addressing
 		 * mode.  Adjust the displacement for the difference between
@@ -391,36 +384,57 @@ int __copy_instruction(u8 *dest, u8 *src)
 		 * extension of the original signed 32-bit displacement would
 		 * have given.
 		 */
-		newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
+		newdisp = (u8 *) src + (s64) insn->displacement.value
+			  - (u8 *) dest;
 		if ((s64) (s32) newdisp != newdisp) {
 			pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
-			pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
+			pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
+				src, dest, insn->displacement.value);
 			return 0;
 		}
-		disp = (u8 *) dest + insn_offset_displacement(&insn);
+		disp = (u8 *) dest + insn_offset_displacement(insn);
 		*(s32 *) disp = (s32) newdisp;
 	}
 #endif
-	return length;
+	return insn->length;
+}
+
+/* Prepare reljump right after instruction to boost */
+static void prepare_boost(struct kprobe *p, struct insn *insn)
+{
+	if (can_boost(insn, p->addr) &&
+	    MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) {
+		/*
+		 * These instructions can be executed directly if it
+		 * jumps back to correct address.
+		 */
+		synthesize_reljump(p->ainsn.insn + insn->length,
+				   p->addr + insn->length);
+		p->ainsn.boostable = true;
+	} else {
+		p->ainsn.boostable = false;
+	}
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
 {
-	int ret;
+	struct insn insn;
+	int len;
+
+	set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
 
 	/* Copy an instruction with recovering if other optprobe modifies it.*/
-	ret = __copy_instruction(p->ainsn.insn, p->addr);
-	if (!ret)
+	len = __copy_instruction(p->ainsn.insn, p->addr, &insn);
+	if (!len)
 		return -EINVAL;
 
 	/*
 	 * __copy_instruction can modify the displacement of the instruction,
 	 * but it doesn't affect boostable check.
 	 */
-	if (can_boost(p->ainsn.insn, p->addr))
-		p->ainsn.boostable = 0;
-	else
-		p->ainsn.boostable = -1;
+	prepare_boost(p, &insn);
+
+	set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
 
 	/* Check whether the instruction modifies Interrupt Flag or not */
 	p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
@@ -459,7 +473,7 @@ void arch_disarm_kprobe(struct kprobe *p)
 void arch_remove_kprobe(struct kprobe *p)
 {
 	if (p->ainsn.insn) {
-		free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+		free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
 		p->ainsn.insn = NULL;
 	}
 }
@@ -531,7 +545,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
 		return;
 
 #if !defined(CONFIG_PREEMPT)
-	if (p->ainsn.boostable == 1 && !p->post_handler) {
+	if (p->ainsn.boostable && !p->post_handler) {
 		/* Boost up -- we can execute copied instructions directly */
 		if (!reenter)
 			reset_current_kprobe();
@@ -851,7 +865,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
 	case 0xcf:
 	case 0xea:	/* jmp absolute -- ip is correct */
 		/* ip is already adjusted, no more changes required */
-		p->ainsn.boostable = 1;
+		p->ainsn.boostable = true;
 		goto no_change;
 	case 0xe8:	/* call relative - Fix return addr */
 		*tos = orig_ip + (*tos - copy_ip);
@@ -876,28 +890,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
 			 * jmp near and far, absolute indirect
 			 * ip is correct. And this is boostable
 			 */
-			p->ainsn.boostable = 1;
+			p->ainsn.boostable = true;
 			goto no_change;
 		}
 	default:
 		break;
 	}
 
-	if (p->ainsn.boostable == 0) {
-		if ((regs->ip > copy_ip) &&
-		    (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
-			/*
-			 * These instructions can be executed directly if it
-			 * jumps back to correct address.
-			 */
-			synthesize_reljump((void *)regs->ip,
-				(void *)orig_ip + (regs->ip - copy_ip));
-			p->ainsn.boostable = 1;
-		} else {
-			p->ainsn.boostable = -1;
-		}
-	}
-
 	regs->ip += orig_ip - copy_ip;
 
 no_change:
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 5f8f0b3cc674..041f7b6dfa0f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 int arch_prepare_kprobe_ftrace(struct kprobe *p)
 {
 	p->ainsn.insn = NULL;
-	p->ainsn.boostable = -1;
+	p->ainsn.boostable = false;
 	return 0;
 }
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 3e7c6e5a08ff..9aadff3d0902 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -65,7 +65,10 @@ found:
 	 * overwritten by jump destination address. In this case, original
 	 * bytes must be recovered from op->optinsn.copied_insn buffer.
 	 */
-	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (probe_kernel_read(buf, (void *)addr,
+		MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+		return 0UL;
+
 	if (addr == (unsigned long)kp->addr) {
 		buf[0] = kp->opcode;
 		memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
@@ -174,11 +177,12 @@ NOKPROBE_SYMBOL(optimized_callback);
 
 static int copy_optimized_instructions(u8 *dest, u8 *src)
 {
+	struct insn insn;
 	int len = 0, ret;
 
 	while (len < RELATIVEJUMP_SIZE) {
-		ret = __copy_instruction(dest + len, src + len);
-		if (!ret || !can_boost(dest + len, src + len))
+		ret = __copy_instruction(dest + len, src + len, &insn);
+		if (!ret || !can_boost(&insn, src + len))
 			return -EINVAL;
 		len += ret;
 	}
@@ -350,6 +354,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
 	}
 
 	buf = (u8 *)op->optinsn.insn;
+	set_memory_rw((unsigned long)buf & PAGE_MASK, 1);
 
 	/* Copy instructions into the out-of-line buffer */
 	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
@@ -372,6 +377,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
 	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
 			   (u8 *)op->kp.addr + op->optinsn.size);
 
+	set_memory_ro((unsigned long)buf & PAGE_MASK, 1);
+
 	flush_icache_range((unsigned long) buf,
 			   (unsigned long) buf + TMPL_END_IDX +
 			   op->optinsn.size + RELATIVEJUMP_SIZE);
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-01 20:23:17 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-01 20:23:17 -0700
commit	7c8c03bfc7b9f5211d8a69eab7fee99c9fb4f449 (patch)
tree	a5cee67325e50e893bf0cc0a0d060983a0df6653 /arch
parent	6dc2cce9321198172cd96f955a5fc798a4cc35a6 (diff)
parent	fd7647979a3948dae4fc6f25dbbdf9ba269bed78 (diff)