Diffstat (limited to 'tools/perf/scripts/python')
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/Build                   |   5
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/Context.c               |  31
-rw-r--r--  tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py  |   7
-rwxr-xr-x  tools/perf/scripts/python/arm-cs-trace-disasm.py                  | 172
-rwxr-xr-x  tools/perf/scripts/python/bin/flamegraph-report                   |   2
-rw-r--r--  tools/perf/scripts/python/bin/gecko-record                        |   2
-rwxr-xr-x  tools/perf/scripts/python/bin/gecko-report                        |   7
-rw-r--r--  tools/perf/scripts/python/compaction-times.py                     |   2
-rwxr-xr-x  tools/perf/scripts/python/exported-sql-viewer.py                  |   9
-rwxr-xr-x  tools/perf/scripts/python/flamegraph.py                           |  82
-rw-r--r--  tools/perf/scripts/python/gecko.py                                | 395
-rw-r--r--  tools/perf/scripts/python/mem-phys-addr.py                        | 177
-rw-r--r--  tools/perf/scripts/python/netdev-times.py                         |   3
-rwxr-xr-x  tools/perf/scripts/python/parallel-perf.py                        | 989
14 files changed, 1704 insertions, 179 deletions
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Build b/tools/perf/scripts/python/Perf-Trace-Util/Build index 7d0e33ce6aba..be3710c61320 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/Build +++ b/tools/perf/scripts/python/Perf-Trace-Util/Build @@ -1,3 +1,4 @@ -perf-y += Context.o +perf-util-y += Context.o -CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs +# -Wno-declaration-after-statement: The python headers have mixed code with declarations (decls after asserts, for instance) +CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-declaration-after-statement diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c index 3954bd1587ce..60dcfe56d4d9 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c +++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c @@ -12,6 +12,7 @@ #define PY_SSIZE_T_CLEAN #include <Python.h> +#include "../../../util/config.h" #include "../../../util/trace-event.h" #include "../../../util/event.h" #include "../../../util/symbol.h" @@ -23,16 +24,6 @@ #include "../../../util/srcline.h" #include "../../../util/srccode.h" -#if PY_MAJOR_VERSION < 3 -#define _PyCapsule_GetPointer(arg1, arg2) \ - PyCObject_AsVoidPtr(arg1) -#define _PyBytes_FromStringAndSize(arg1, arg2) \ - PyString_FromStringAndSize((arg1), (arg2)) -#define _PyUnicode_AsUTF8(arg) \ - PyString_AsString(arg) - -PyMODINIT_FUNC initperf_trace_context(void); -#else #define _PyCapsule_GetPointer(arg1, arg2) \ PyCapsule_GetPointer((arg1), (arg2)) #define _PyBytes_FromStringAndSize(arg1, arg2) \ @@ -41,7 +32,6 @@ PyMODINIT_FUNC initperf_trace_context(void); PyUnicode_AsUTF8(arg) PyMODINIT_FUNC PyInit_perf_trace_context(void); -#endif static struct scripting_context *get_args(PyObject *args, const char *name, PyObject **arg2) { @@ -103,7 +93,7 @@ static PyObject *perf_sample_insn(PyObject *obj, PyObject *args) if (c->sample->ip && !c->sample->insn_len && thread__maps(c->al->thread)) { struct machine *machine = maps__machine(thread__maps(c->al->thread)); - script_fetch_insn(c->sample, c->al->thread, machine); + script_fetch_insn(c->sample, c->al->thread, machine, /*native_arch=*/true); } if (!c->sample->insn_len) Py_RETURN_NONE; /* N.B. 
This is a return statement */ @@ -182,6 +172,15 @@ static PyObject *perf_sample_srccode(PyObject *obj, PyObject *args) return perf_sample_src(obj, args, true); } +static PyObject *__perf_config_get(PyObject *obj, PyObject *args) +{ + const char *config_name; + + if (!PyArg_ParseTuple(args, "s", &config_name)) + return NULL; + return Py_BuildValue("s", perf_config_get(config_name)); +} + static PyMethodDef ContextMethods[] = { #ifdef HAVE_LIBTRACEEVENT { "common_pc", perf_trace_context_common_pc, METH_VARARGS, @@ -199,15 +198,10 @@ static PyMethodDef ContextMethods[] = { METH_VARARGS, "Get source file name and line number."}, { "perf_sample_srccode", perf_sample_srccode, METH_VARARGS, "Get source file name, line number and line."}, + { "perf_config_get", __perf_config_get, METH_VARARGS, "Get perf config entry"}, { NULL, NULL, 0, NULL} }; -#if PY_MAJOR_VERSION < 3 -PyMODINIT_FUNC initperf_trace_context(void) -{ - (void) Py_InitModule("perf_trace_context", ContextMethods); -} -#else PyMODINIT_FUNC PyInit_perf_trace_context(void) { static struct PyModuleDef moduledef = { @@ -229,4 +223,3 @@ PyMODINIT_FUNC PyInit_perf_trace_context(void) return mod; } -#endif diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py index 7384dcb628c4..b75d31858e54 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py +++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py @@ -54,6 +54,7 @@ try: import audit machine_to_id = { 'x86_64': audit.MACH_86_64, + 'aarch64': audit.MACH_AARCH64, 'alpha' : audit.MACH_ALPHA, 'ia64' : audit.MACH_IA64, 'ppc' : audit.MACH_PPC, @@ -73,9 +74,9 @@ try: except: if not audit_package_warned: audit_package_warned = True - print("Install the audit-libs-python package to get syscall names.\n" - "For example:\n # apt-get install python-audit (Ubuntu)" - "\n # yum install audit-libs-python (Fedora)" + print("Install the python-audit package to get syscall names.\n" + "For example:\n # apt-get install python3-audit (Ubuntu)" + "\n # yum install python3-audit (Fedora)" "\n etc.\n") def syscall_name(id): diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index d59ff53f1d94..ba208c90d631 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -11,48 +11,98 @@ import os from os import path import re from subprocess import * -from optparse import OptionParser, make_option +import argparse +import platform -from perf_trace_context import perf_set_itrace_options, \ - perf_sample_insn, perf_sample_srccode +from perf_trace_context import perf_sample_srccode, perf_config_get # Below are some example commands for using this script. +# Note a --kcore recording is required for accurate decode +# due to the alternatives patching mechanism. However this +# script only supports reading vmlinux for disassembly dump, +# meaning that any patched instructions will appear +# as unpatched, but the instruction ranges themselves will +# be correct. In addition to this, source line info comes +# from Perf, and when using kcore there is no debug info. 
The +# following lists the supported features in each mode: +# +# +-----------+-----------------+------------------+------------------+ +# | Recording | Accurate decode | Source line dump | Disassembly dump | +# +-----------+-----------------+------------------+------------------+ +# | --kcore | yes | no | yes | +# | normal | no | yes | yes | +# +-----------+-----------------+------------------+------------------+ +# +# Output disassembly with objdump and auto detect vmlinux +# (when running on same machine.) +# perf script -s scripts/python/arm-cs-trace-disasm.py -d # -# Output disassembly with objdump: -# perf script -s scripts/python/arm-cs-trace-disasm.py \ -# -- -d objdump -k path/to/vmlinux # Output disassembly with llvm-objdump: # perf script -s scripts/python/arm-cs-trace-disasm.py \ # -- -d llvm-objdump-11 -k path/to/vmlinux +# # Output only source line and symbols: # perf script -s scripts/python/arm-cs-trace-disasm.py -# Command line parsing. -option_list = [ - # formatting options for the bottom entry of the stack - make_option("-k", "--vmlinux", dest="vmlinux_name", - help="Set path to vmlinux file"), - make_option("-d", "--objdump", dest="objdump_name", - help="Set path to objdump executable file"), - make_option("-v", "--verbose", dest="verbose", - action="store_true", default=False, - help="Enable debugging log") -] +def default_objdump(): + config = perf_config_get("annotate.objdump") + return config if config else "objdump" -parser = OptionParser(option_list=option_list) -(options, args) = parser.parse_args() +# Command line parsing. +def int_arg(v): + v = int(v) + if v < 0: + raise argparse.ArgumentTypeError("Argument must be a positive integer") + return v + +args = argparse.ArgumentParser() +args.add_argument("-k", "--vmlinux", + help="Set path to vmlinux file. Omit to autodetect if running on same machine") +args.add_argument("-d", "--objdump", nargs="?", const=default_objdump(), + help="Show disassembly. Can also be used to change the objdump path"), +args.add_argument("-v", "--verbose", action="store_true", help="Enable debugging log") +args.add_argument("--start-time", type=int_arg, help="Monotonic clock time of sample to start from. " + "See 'time' field on samples in -v mode.") +args.add_argument("--stop-time", type=int_arg, help="Monotonic clock time of sample to stop at. " + "See 'time' field on samples in -v mode.") +args.add_argument("--start-sample", type=int_arg, help="Index of sample to start from. " + "See 'index' field on samples in -v mode.") +args.add_argument("--stop-sample", type=int_arg, help="Index of sample to stop at. 
" + "See 'index' field on samples in -v mode.") + +options = args.parse_args() +if (options.start_time and options.stop_time and + options.start_time >= options.stop_time): + print("--start-time must less than --stop-time") + exit(2) +if (options.start_sample and options.stop_sample and + options.start_sample >= options.stop_sample): + print("--start-sample must less than --stop-sample") + exit(2) # Initialize global dicts and regular expression disasm_cache = dict() cpu_data = dict() -disasm_re = re.compile("^\s*([0-9a-fA-F]+):") -disasm_func_re = re.compile("^\s*([0-9a-fA-F]+)\s.*:") +disasm_re = re.compile(r"^\s*([0-9a-fA-F]+):") +disasm_func_re = re.compile(r"^\s*([0-9a-fA-F]+)\s.*:") cache_size = 64*1024 +sample_idx = -1 glb_source_file_name = None glb_line_number = None glb_dso = None +kver = platform.release() +vmlinux_paths = [ + f"/usr/lib/debug/boot/vmlinux-{kver}.debug", + f"/usr/lib/debug/lib/modules/{kver}/vmlinux", + f"/lib/modules/{kver}/build/vmlinux", + f"/usr/lib/debug/boot/vmlinux-{kver}", + f"/boot/vmlinux-{kver}", + f"/boot/vmlinux", + f"vmlinux" +] + def get_optional(perf_dict, field): if field in perf_dict: return perf_dict[field] @@ -63,12 +113,25 @@ def get_offset(perf_dict, field): return "+%#x" % perf_dict[field] return "" +def find_vmlinux(): + if hasattr(find_vmlinux, "path"): + return find_vmlinux.path + + for v in vmlinux_paths: + if os.access(v, os.R_OK): + find_vmlinux.path = v + break + else: + find_vmlinux.path = None + + return find_vmlinux.path + def get_dso_file_path(dso_name, dso_build_id): if (dso_name == "[kernel.kallsyms]" or dso_name == "vmlinux"): - if (options.vmlinux_name): - return options.vmlinux_name; + if (options.vmlinux): + return options.vmlinux; else: - return dso_name + return find_vmlinux() if find_vmlinux() else dso_name if (dso_name == "[vdso]") : append = "/vdso" @@ -92,7 +155,7 @@ def read_disam(dso_fname, dso_start, start_addr, stop_addr): else: start_addr = start_addr - dso_start; stop_addr = stop_addr - dso_start; - disasm = [ options.objdump_name, "-d", "-z", + disasm = [ options.objdump, "-d", "-z", "--start-address="+format(start_addr,"#x"), "--stop-address="+format(stop_addr,"#x") ] disasm += [ dso_fname ] @@ -112,10 +175,10 @@ def print_disam(dso_fname, dso_start, start_addr, stop_addr): def print_sample(sample): print("Sample = { cpu: %04d addr: 0x%016x phys_addr: 0x%016x ip: 0x%016x " \ - "pid: %d tid: %d period: %d time: %d }" % \ + "pid: %d tid: %d period: %d time: %d index: %d}" % \ (sample['cpu'], sample['addr'], sample['phys_addr'], \ sample['ip'], sample['pid'], sample['tid'], \ - sample['period'], sample['time'])) + sample['period'], sample['time'], sample_idx)) def trace_begin(): print('ARM CoreSight Trace Data Assembler Dump') @@ -177,6 +240,7 @@ def print_srccode(comm, param_dict, sample, symbol, dso): def process_event(param_dict): global cache_size global options + global sample_idx sample = param_dict["sample"] comm = param_dict["comm"] @@ -187,11 +251,36 @@ def process_event(param_dict): dso_start = get_optional(param_dict, "dso_map_start") dso_end = get_optional(param_dict, "dso_map_end") symbol = get_optional(param_dict, "symbol") + map_pgoff = get_optional(param_dict, "map_pgoff") + # check for valid map offset + if (str(map_pgoff) == '[unknown]'): + map_pgoff = 0 + + cpu = sample["cpu"] + ip = sample["ip"] + addr = sample["addr"] + + sample_idx += 1 + + if (options.start_time and sample["time"] < options.start_time): + return + if (options.stop_time and sample["time"] > options.stop_time): + exit(0) + 
if (options.start_sample and sample_idx < options.start_sample): + return + if (options.stop_sample and sample_idx > options.stop_sample): + exit(0) if (options.verbose == True): print("Event type: %s" % name) print_sample(sample) + # Initialize CPU data if it's empty, and directly return back + # if this is the first tracing event for this CPU. + if (cpu_data.get(str(cpu) + 'addr') == None): + cpu_data[str(cpu) + 'addr'] = addr + return + # If cannot find dso so cannot dump assembler, bail out if (dso == '[unknown]'): return @@ -209,16 +298,6 @@ def process_event(param_dict): if (name[0:8] != "branches"): return - cpu = sample["cpu"] - ip = sample["ip"] - addr = sample["addr"] - - # Initialize CPU data if it's empty, and directly return back - # if this is the first tracing event for this CPU. - if (cpu_data.get(str(cpu) + 'addr') == None): - cpu_data[str(cpu) + 'addr'] = addr - return - # The format for packet is: # # +------------+------------+------------+ @@ -243,9 +322,10 @@ def process_event(param_dict): # Record for previous sample packet cpu_data[str(cpu) + 'addr'] = addr - # Handle CS_ETM_TRACE_ON packet if start_addr=0 and stop_addr=4 - if (start_addr == 0 and stop_addr == 4): - print("CPU%d: CS_ETM_TRACE_ON packet is inserted" % cpu) + # Filter out zero start_address. Optionally identify CS_ETM_TRACE_ON packet + if (start_addr == 0): + if ((stop_addr == 4) and (options.verbose == True)): + print("CPU%d: CS_ETM_TRACE_ON packet is inserted" % cpu) return if (start_addr < int(dso_start) or start_addr > int(dso_end)): @@ -256,18 +336,20 @@ def process_event(param_dict): print("Stop address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (stop_addr, int(dso_start), int(dso_end), dso)) return - if (options.objdump_name != None): + if (options.objdump != None): # It doesn't need to decrease virtual memory offset for disassembly - # for kernel dso, so in this case we set vm_start to zero. - if (dso == "[kernel.kallsyms]"): + # for kernel dso and executable file dso, so in this case we set + # vm_start to zero. + if (dso == "[kernel.kallsyms]" or dso_start == 0x400000): dso_vm_start = 0 + map_pgoff = 0 else: dso_vm_start = int(dso_start) dso_fname = get_dso_file_path(dso, dso_bid) if path.exists(dso_fname): - print_disam(dso_fname, dso_vm_start, start_addr, stop_addr) + print_disam(dso_fname, dso_vm_start, start_addr + map_pgoff, stop_addr + map_pgoff) else: - print("Failed to find dso %s for address range [ 0x%x .. 0x%x ]" % (dso, start_addr, stop_addr)) + print("Failed to find dso %s for address range [ 0x%x .. 
0x%x ]" % (dso, start_addr + map_pgoff, stop_addr + map_pgoff)) print_srccode(comm, param_dict, sample, symbol, dso) diff --git a/tools/perf/scripts/python/bin/flamegraph-report b/tools/perf/scripts/python/bin/flamegraph-report index 53c5dc90c87e..453a6918afbe 100755 --- a/tools/perf/scripts/python/bin/flamegraph-report +++ b/tools/perf/scripts/python/bin/flamegraph-report @@ -1,3 +1,3 @@ #!/bin/bash # description: create flame graphs -perf script -s "$PERF_EXEC_PATH"/scripts/python/flamegraph.py -- "$@" +perf script -s "$PERF_EXEC_PATH"/scripts/python/flamegraph.py "$@" diff --git a/tools/perf/scripts/python/bin/gecko-record b/tools/perf/scripts/python/bin/gecko-record new file mode 100644 index 000000000000..f0d1aa55f171 --- /dev/null +++ b/tools/perf/scripts/python/bin/gecko-record @@ -0,0 +1,2 @@ +#!/bin/bash +perf record -F 99 -g "$@" diff --git a/tools/perf/scripts/python/bin/gecko-report b/tools/perf/scripts/python/bin/gecko-report new file mode 100755 index 000000000000..1867ec8d9757 --- /dev/null +++ b/tools/perf/scripts/python/bin/gecko-report @@ -0,0 +1,7 @@ +#!/bin/bash +# description: create firefox gecko profile json format from perf.data +if [ "$*" = "-i -" ]; then +perf script -s "$PERF_EXEC_PATH"/scripts/python/gecko.py +else +perf script -s "$PERF_EXEC_PATH"/scripts/python/gecko.py -- "$@" +fi diff --git a/tools/perf/scripts/python/compaction-times.py b/tools/perf/scripts/python/compaction-times.py index 2560a042dc6f..9401f7c14747 100644 --- a/tools/perf/scripts/python/compaction-times.py +++ b/tools/perf/scripts/python/compaction-times.py @@ -260,7 +260,7 @@ def pr_help(): comm_re = None pid_re = None -pid_regex = "^(\d*)-(\d*)$|^(\d*)$" +pid_regex = r"^(\d*)-(\d*)$|^(\d*)$" opt_proc = popt.DISP_DFL opt_disp = topt.DISP_ALL diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index 13f2d8a81610..e0b2e7268ef6 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -677,10 +677,13 @@ class CallGraphModelBase(TreeModel): # sqlite supports GLOB (text only) which uses * and ? and is case sensitive if not self.glb.dbref.is_sqlite3: # Escape % and _ - s = value.replace("%", "\%") - s = s.replace("_", "\_") + s = value.replace("%", "\\%") + s = s.replace("_", "\\_") # Translate * and ? 
into SQL LIKE pattern characters % and _ - trans = string.maketrans("*?", "%_") + if sys.version_info[0] == 3: + trans = str.maketrans("*?", "%_") + else: + trans = string.maketrans("*?", "%_") match = " LIKE '" + str(s).translate(trans) + "'" else: match = " GLOB '" + str(value) + "'" diff --git a/tools/perf/scripts/python/flamegraph.py b/tools/perf/scripts/python/flamegraph.py index cf7ce8229a6c..ad735990c5be 100755 --- a/tools/perf/scripts/python/flamegraph.py +++ b/tools/perf/scripts/python/flamegraph.py @@ -18,7 +18,6 @@ # pylint: disable=missing-class-docstring # pylint: disable=missing-function-docstring -from __future__ import print_function import argparse import hashlib import io @@ -26,9 +25,10 @@ import json import os import subprocess import sys +from typing import Dict, Optional, Union import urllib.request -minimal_html = """<head> +MINIMAL_HTML = """<head> <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/d3-flamegraph.css"> </head> <body> @@ -50,20 +50,20 @@ minimal_html = """<head> # pylint: disable=too-few-public-methods class Node: - def __init__(self, name, libtype): + def __init__(self, name: str, libtype: str): self.name = name # "root" | "kernel" | "" # "" indicates user space self.libtype = libtype - self.value = 0 - self.children = [] + self.value: int = 0 + self.children: list[Node] = [] - def to_json(self): + def to_json(self) -> Dict[str, Union[str, int, list[Dict]]]: return { "n": self.name, "l": self.libtype, "v": self.value, - "c": self.children + "c": [x.to_json() for x in self.children] } @@ -73,7 +73,7 @@ class FlameGraphCLI: self.stack = Node("all", "root") @staticmethod - def get_libtype_from_dso(dso): + def get_libtype_from_dso(dso: Optional[str]) -> str: """ when kernel-debuginfo is installed, dso points to /usr/lib/debug/lib/modules/*/vmlinux @@ -84,7 +84,7 @@ class FlameGraphCLI: return "" @staticmethod - def find_or_create_node(node, name, libtype): + def find_or_create_node(node: Node, name: str, libtype: str) -> Node: for child in node.children: if child.name == name: return child @@ -93,7 +93,12 @@ class FlameGraphCLI: node.children.append(child) return child - def process_event(self, event): + def process_event(self, event) -> None: + # ignore events where the event name does not match + # the one specified by the user + if self.args.event_name and event.get("ev_name") != self.args.event_name: + return + pid = event.get("sample", {}).get("pid", 0) # event["dso"] sometimes contains /usr/lib/debug/lib/modules/*/vmlinux # for user-space processes; let's use pid for kernel or user-space distinction @@ -101,7 +106,7 @@ class FlameGraphCLI: comm = event["comm"] libtype = "kernel" else: - comm = "{} ({})".format(event["comm"], pid) + comm = f"{event['comm']} ({pid})" libtype = "" node = self.find_or_create_node(self.stack, comm, libtype) @@ -116,20 +121,30 @@ class FlameGraphCLI: node = self.find_or_create_node(node, name, libtype) node.value += 1 - def get_report_header(self): + def get_report_header(self) -> str: if self.args.input == "-": # when this script is invoked with "perf script flamegraph", # no perf.data is created and we cannot read the header of it return "" try: - output = subprocess.check_output(["perf", "report", "--header-only"]) - return output.decode("utf-8") + # if the file name other than perf.data is given, + # we read the header of that file + if self.args.input: + output = subprocess.check_output(["perf", "report", "--header-only", + "-i", self.args.input]) + else: + output = 
subprocess.check_output(["perf", "report", "--header-only"]) + + result = output.decode("utf-8") + if self.args.event_name: + result += "\nFocused event: " + self.args.event_name + return result except Exception as err: # pylint: disable=broad-except - print("Error reading report header: {}".format(err), file=sys.stderr) + print(f"Error reading report header: {err}", file=sys.stderr) return "" - def trace_end(self): + def trace_end(self) -> None: stacks_json = json.dumps(self.stack, default=lambda x: x.to_json()) if self.args.format == "html": @@ -153,7 +168,8 @@ graph template (--template PATH) or use another output format (--format FORMAT).""", file=sys.stderr) if self.args.input == "-": - print("""Not attempting to download Flame Graph template as script command line + print( +"""Not attempting to download Flame Graph template as script command line input is disabled due to using live mode. If you want to download the template retry without live mode. For example, use 'perf record -a -g -F 99 sleep 60' and 'perf script report flamegraph'. Alternatively, @@ -162,37 +178,40 @@ https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/templates/d3-flamegraph-b and place it at: /usr/share/d3-flame-graph/d3-flamegraph-base.html""", file=sys.stderr) - quit() + sys.exit(1) s = None - while s != "y" and s != "n": - s = input("Do you wish to download a template from cdn.jsdelivr.net? (this warning can be suppressed with --allow-download) [yn] ").lower() + while s not in ["y", "n"]: + s = input("Do you wish to download a template from cdn.jsdelivr.net?" + + "(this warning can be suppressed with --allow-download) [yn] " + ).lower() if s == "n": - quit() - template = "https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/templates/d3-flamegraph-base.html" + sys.exit(1) + template = ("https://cdn.jsdelivr.net/npm/d3-flame-graph@4.1.3/dist/templates/" + "d3-flamegraph-base.html") template_md5sum = "143e0d06ba69b8370b9848dcd6ae3f36" try: - with urllib.request.urlopen(template) as template: + with urllib.request.urlopen(template) as url_template: output_str = "".join([ - l.decode("utf-8") for l in template.readlines() + l.decode("utf-8") for l in url_template.readlines() ]) except Exception as err: print(f"Error reading template {template}: {err}\n" "a minimal flame graph will be generated", file=sys.stderr) - output_str = minimal_html + output_str = MINIMAL_HTML template_md5sum = None if template_md5sum: download_md5sum = hashlib.md5(output_str.encode("utf-8")).hexdigest() if download_md5sum != template_md5sum: s = None - while s != "y" and s != "n": + while s not in ["y", "n"]: s = input(f"""Unexpected template md5sum. 
{download_md5sum} != {template_md5sum}, for: {output_str} continue?[yn] """).lower() if s == "n": - quit() + sys.exit(1) output_str = output_str.replace("/** @options_json **/", options_json) output_str = output_str.replace("/** @flamegraph_json **/", stacks_json) @@ -206,12 +225,12 @@ continue?[yn] """).lower() with io.open(sys.stdout.fileno(), "w", encoding="utf-8", closefd=False) as out: out.write(output_str) else: - print("dumping data to {}".format(output_fn)) + print(f"dumping data to {output_fn}") try: with io.open(output_fn, "w", encoding="utf-8") as out: out.write(output_str) except IOError as err: - print("Error writing output file: {}".format(err), file=sys.stderr) + print(f"Error writing output file: {err}", file=sys.stderr) sys.exit(1) @@ -235,6 +254,11 @@ if __name__ == "__main__": default=False, action="store_true", help="allow unprompted downloading of HTML template") + parser.add_argument("-e", "--event", + default="", + dest="event_name", + type=str, + help="specify the event to generate flamegraph for") cli_args = parser.parse_args() cli = FlameGraphCLI(cli_args) diff --git a/tools/perf/scripts/python/gecko.py b/tools/perf/scripts/python/gecko.py new file mode 100644 index 000000000000..bc5a72f94bfa --- /dev/null +++ b/tools/perf/scripts/python/gecko.py @@ -0,0 +1,395 @@ +# gecko.py - Convert perf record output to Firefox's gecko profile format +# SPDX-License-Identifier: GPL-2.0 +# +# The script converts perf.data to Gecko Profile Format, +# which can be read by https://profiler.firefox.com/. +# +# Usage: +# +# perf record -a -g -F 99 sleep 60 +# perf script report gecko +# +# Combined: +# +# perf script gecko -F 99 -a sleep 60 + +import os +import sys +import time +import json +import string +import random +import argparse +import threading +import webbrowser +import urllib.parse +from os import system +from functools import reduce +from dataclasses import dataclass, field +from http.server import HTTPServer, SimpleHTTPRequestHandler, test +from typing import List, Dict, Optional, NamedTuple, Set, Tuple, Any + +# Add the Perf-Trace-Util library to the Python path +sys.path.append(os.environ['PERF_EXEC_PATH'] + \ + '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + +from perf_trace_context import * +from Core import * + +StringID = int +StackID = int +FrameID = int +CategoryID = int +Milliseconds = float + +# start_time is intialiazed only once for the all event traces. +start_time = None + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/profile.js#L425 +# Follow Brendan Gregg's Flamegraph convention: orange for kernel and yellow for user space by default. +CATEGORIES = None + +# The product name is used by the profiler UI to show the Operating system and Processor. +PRODUCT = os.popen('uname -op').read().strip() + +# store the output file +output_file = None + +# Here key = tid, value = Thread +tid_to_thread = dict() + +# The HTTP server is used to serve the profile to the profiler UI. +http_server_thread = None + +# The category index is used by the profiler UI to show the color of the flame graph. 
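The two category indices that follow select entries in the CATEGORIES list built in main() at the bottom of this script:

# CATEGORIES[USER_CATEGORY_INDEX]   -> {"name": "User",   "color": user_color,   "subcategories": ["Other"]}
# CATEGORIES[KERNEL_CATEGORY_INDEX] -> {"name": "Kernel", "color": kernel_color, "subcategories": ["Other"]}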
+USER_CATEGORY_INDEX = 0 +KERNEL_CATEGORY_INDEX = 1 + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L156 +class Frame(NamedTuple): + string_id: StringID + relevantForJS: bool + innerWindowID: int + implementation: None + optimizations: None + line: None + column: None + category: CategoryID + subcategory: int + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L216 +class Stack(NamedTuple): + prefix_id: Optional[StackID] + frame_id: FrameID + +# https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L90 +class Sample(NamedTuple): + stack_id: Optional[StackID] + time_ms: Milliseconds + responsiveness: int + +@dataclass +class Thread: + """A builder for a profile of the thread. + + Attributes: + comm: Thread command-line (name). + pid: process ID of containing process. + tid: thread ID. + samples: Timeline of profile samples. + frameTable: interned stack frame ID -> stack frame. + stringTable: interned string ID -> string. + stringMap: interned string -> string ID. + stackTable: interned stack ID -> stack. + stackMap: (stack prefix ID, leaf stack frame ID) -> interned Stack ID. + frameMap: Stack Frame string -> interned Frame ID. + comm: str + pid: int + tid: int + samples: List[Sample] = field(default_factory=list) + frameTable: List[Frame] = field(default_factory=list) + stringTable: List[str] = field(default_factory=list) + stringMap: Dict[str, int] = field(default_factory=dict) + stackTable: List[Stack] = field(default_factory=list) + stackMap: Dict[Tuple[Optional[int], int], int] = field(default_factory=dict) + frameMap: Dict[str, int] = field(default_factory=dict) + """ + comm: str + pid: int + tid: int + samples: List[Sample] = field(default_factory=list) + frameTable: List[Frame] = field(default_factory=list) + stringTable: List[str] = field(default_factory=list) + stringMap: Dict[str, int] = field(default_factory=dict) + stackTable: List[Stack] = field(default_factory=list) + stackMap: Dict[Tuple[Optional[int], int], int] = field(default_factory=dict) + frameMap: Dict[str, int] = field(default_factory=dict) + + def _intern_stack(self, frame_id: int, prefix_id: Optional[int]) -> int: + """Gets a matching stack, or saves the new stack. Returns a Stack ID.""" + key = f"{frame_id}" if prefix_id is None else f"{frame_id},{prefix_id}" + # key = (prefix_id, frame_id) + stack_id = self.stackMap.get(key) + if stack_id is None: + # return stack_id + stack_id = len(self.stackTable) + self.stackTable.append(Stack(prefix_id=prefix_id, frame_id=frame_id)) + self.stackMap[key] = stack_id + return stack_id + + def _intern_string(self, string: str) -> int: + """Gets a matching string, or saves the new string. Returns a String ID.""" + string_id = self.stringMap.get(string) + if string_id is not None: + return string_id + string_id = len(self.stringTable) + self.stringTable.append(string) + self.stringMap[string] = string_id + return string_id + + def _intern_frame(self, frame_str: str) -> int: + """Gets a matching stack frame, or saves the new frame. 
Returns a Frame ID.""" + frame_id = self.frameMap.get(frame_str) + if frame_id is not None: + return frame_id + frame_id = len(self.frameTable) + self.frameMap[frame_str] = frame_id + string_id = self._intern_string(frame_str) + + symbol_name_to_category = KERNEL_CATEGORY_INDEX if frame_str.find('kallsyms') != -1 \ + or frame_str.find('/vmlinux') != -1 \ + or frame_str.endswith('.ko)') \ + else USER_CATEGORY_INDEX + + self.frameTable.append(Frame( + string_id=string_id, + relevantForJS=False, + innerWindowID=0, + implementation=None, + optimizations=None, + line=None, + column=None, + category=symbol_name_to_category, + subcategory=None, + )) + return frame_id + + def _add_sample(self, comm: str, stack: List[str], time_ms: Milliseconds) -> None: + """Add a timestamped stack trace sample to the thread builder. + Args: + comm: command-line (name) of the thread at this sample + stack: sampled stack frames. Root first, leaf last. + time_ms: timestamp of sample in milliseconds. + """ + # Ihreads may not set their names right after they are created. + # Instead, they might do it later. In such situations, to use the latest name they have set. + if self.comm != comm: + self.comm = comm + + prefix_stack_id = reduce(lambda prefix_id, frame: self._intern_stack + (self._intern_frame(frame), prefix_id), stack, None) + if prefix_stack_id is not None: + self.samples.append(Sample(stack_id=prefix_stack_id, + time_ms=time_ms, + responsiveness=0)) + + def _to_json_dict(self) -> Dict: + """Converts current Thread to GeckoThread JSON format.""" + # Gecko profile format is row-oriented data as List[List], + # And a schema for interpreting each index. + # Schema: + # https://github.com/firefox-devtools/profiler/blob/main/docs-developer/gecko-profile-format.md + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L230 + return { + "tid": self.tid, + "pid": self.pid, + "name": self.comm, + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L51 + "markers": { + "schema": { + "name": 0, + "startTime": 1, + "endTime": 2, + "phase": 3, + "category": 4, + "data": 5, + }, + "data": [], + }, + + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L90 + "samples": { + "schema": { + "stack": 0, + "time": 1, + "responsiveness": 2, + }, + "data": self.samples + }, + + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L156 + "frameTable": { + "schema": { + "location": 0, + "relevantForJS": 1, + "innerWindowID": 2, + "implementation": 3, + "optimizations": 4, + "line": 5, + "column": 6, + "category": 7, + "subcategory": 8, + }, + "data": self.frameTable, + }, + + # https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L216 + "stackTable": { + "schema": { + "prefix": 0, + "frame": 1, + }, + "data": self.stackTable, + }, + "stringTable": self.stringTable, + "registerTime": 0, + "unregisterTime": None, + "processType": "default", + } + +# Uses perf script python interface to parse each +# event and store the data in the thread builder. 
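As a minimal self-contained sketch of the interning scheme above (frame IDs made up), a root-first stack collapses into shared (prefix, leaf) pairs and a sample stores only the leaf stack ID; process_event below feeds real perf samples through the same logic via _add_sample:

    stack_table = []   # rows of (prefix_id, frame_id), like stackTable above
    stack_map = {}     # (prefix_id, frame_id) -> stack ID

    def intern_stack(frame_id, prefix_id=None):
        key = (prefix_id, frame_id)
        if key not in stack_map:
            stack_map[key] = len(stack_table)
            stack_table.append(key)
        return stack_map[key]

    leaf = None
    for frame_id in [0, 1, 2]:   # e.g. main -> foo -> bar, root first
        leaf = intern_stack(frame_id, leaf)
    # leaf == 2; repeated samples with the same call path intern to the
    # same rows, so shared stack prefixes are stored only once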
+def process_event(param_dict: Dict) -> None: + global start_time + global tid_to_thread + time_stamp = (param_dict['sample']['time'] // 1000) / 1000 + pid = param_dict['sample']['pid'] + tid = param_dict['sample']['tid'] + comm = param_dict['comm'] + + # Start time is the time of the first sample + if not start_time: + start_time = time_stamp + + # Parse and append the callchain of the current sample into a stack. + stack = [] + if param_dict['callchain']: + for call in param_dict['callchain']: + if 'sym' not in call: + continue + stack.append(f'{call["sym"]["name"]} (in {call["dso"]})') + if len(stack) != 0: + # Reverse the stack, as root come first and the leaf at the end. + stack = stack[::-1] + + # During perf record if -g is not used, the callchain is not available. + # In that case, the symbol and dso are available in the event parameters. + else: + func = param_dict['symbol'] if 'symbol' in param_dict else '[unknown]' + dso = param_dict['dso'] if 'dso' in param_dict else '[unknown]' + stack.append(f'{func} (in {dso})') + + # Add sample to the specific thread. + thread = tid_to_thread.get(tid) + if thread is None: + thread = Thread(comm=comm, pid=pid, tid=tid) + tid_to_thread[tid] = thread + thread._add_sample(comm=comm, stack=stack, time_ms=time_stamp) + +def trace_begin() -> None: + global output_file + if (output_file is None): + print("Staring Firefox Profiler on your default browser...") + global http_server_thread + http_server_thread = threading.Thread(target=test, args=(CORSRequestHandler, HTTPServer,)) + http_server_thread.daemon = True + http_server_thread.start() + +# Trace_end runs at the end and will be used to aggregate +# the data into the final json object and print it out to stdout. +def trace_end() -> None: + global output_file + threads = [thread._to_json_dict() for thread in tid_to_thread.values()] + + # Schema: https://github.com/firefox-devtools/profiler/blob/53970305b51b9b472e26d7457fee1d66cd4e2737/src/types/gecko-profile.js#L305 + gecko_profile_with_meta = { + "meta": { + "interval": 1, + "processType": 0, + "product": PRODUCT, + "stackwalk": 1, + "debug": 0, + "gcpoison": 0, + "asyncstack": 1, + "startTime": start_time, + "shutdownTime": None, + "version": 24, + "presymbolicated": True, + "categories": CATEGORIES, + "markerSchema": [], + }, + "libs": [], + "threads": threads, + "processes": [], + "pausedRanges": [], + } + # launch the profiler on local host if not specified --save-only args, otherwise print to file + if (output_file is None): + output_file = 'gecko_profile.json' + with open(output_file, 'w') as f: + json.dump(gecko_profile_with_meta, f, indent=2) + launchFirefox(output_file) + time.sleep(1) + print(f'[ perf gecko: Captured and wrote into {output_file} ]') + else: + print(f'[ perf gecko: Captured and wrote into {output_file} ]') + with open(output_file, 'w') as f: + json.dump(gecko_profile_with_meta, f, indent=2) + +# Used to enable Cross-Origin Resource Sharing (CORS) for requests coming from 'https://profiler.firefox.com', allowing it to access resources from this server. 
+class CORSRequestHandler(SimpleHTTPRequestHandler): + def end_headers (self): + self.send_header('Access-Control-Allow-Origin', 'https://profiler.firefox.com') + SimpleHTTPRequestHandler.end_headers(self) + +# start a local server to serve the gecko_profile.json file to the profiler.firefox.com +def launchFirefox(file): + safe_string = urllib.parse.quote_plus(f'http://localhost:8000/{file}') + url = 'https://profiler.firefox.com/from-url/' + safe_string + webbrowser.open(f'{url}') + +def main() -> None: + global output_file + global CATEGORIES + parser = argparse.ArgumentParser(description="Convert perf.data to Firefox\'s Gecko Profile format which can be uploaded to profiler.firefox.com for visualization") + + # Add the command-line options + # Colors must be defined according to this: + # https://github.com/firefox-devtools/profiler/blob/50124adbfa488adba6e2674a8f2618cf34b59cd2/res/css/categories.css + parser.add_argument('--user-color', default='yellow', help='Color for the User category', choices=['yellow', 'blue', 'purple', 'green', 'orange', 'red', 'grey', 'magenta']) + parser.add_argument('--kernel-color', default='orange', help='Color for the Kernel category', choices=['yellow', 'blue', 'purple', 'green', 'orange', 'red', 'grey', 'magenta']) + # If --save-only is specified, the output will be saved to a file instead of opening Firefox's profiler directly. + parser.add_argument('--save-only', help='Save the output to a file instead of opening Firefox\'s profiler') + + # Parse the command-line arguments + args = parser.parse_args() + # Access the values provided by the user + user_color = args.user_color + kernel_color = args.kernel_color + output_file = args.save_only + + CATEGORIES = [ + { + "name": 'User', + "color": user_color, + "subcategories": ['Other'] + }, + { + "name": 'Kernel', + "color": kernel_color, + "subcategories": ['Other'] + }, + ] + +if __name__ == '__main__': + main() diff --git a/tools/perf/scripts/python/mem-phys-addr.py b/tools/perf/scripts/python/mem-phys-addr.py index 1f332e72b9b0..5e237a5a5f1b 100644 --- a/tools/perf/scripts/python/mem-phys-addr.py +++ b/tools/perf/scripts/python/mem-phys-addr.py @@ -3,98 +3,125 @@ # # Copyright (c) 2018, Intel Corporation. -from __future__ import division -from __future__ import print_function - import os import sys -import struct import re import bisect import collections +from dataclasses import dataclass +from typing import (Dict, Optional) sys.path.append(os.environ['PERF_EXEC_PATH'] + \ - '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + '/scripts/python/Perf-Trace-Util/lib/Perf/Trace') + +@dataclass(frozen=True) +class IomemEntry: + """Read from a line in /proc/iomem""" + begin: int + end: int + indent: int + label: str -#physical address ranges for System RAM -system_ram = [] -#physical address ranges for Persistent Memory -pmem = [] -#file object for proc iomem -f = None -#Count for each type of memory -load_mem_type_cnt = collections.Counter() -#perf event name -event_name = None +# Physical memory layout from /proc/iomem. Key is the indent and then +# a list of ranges. +iomem: Dict[int, list[IomemEntry]] = collections.defaultdict(list) +# Child nodes from the iomem parent. +children: Dict[IomemEntry, set[IomemEntry]] = collections.defaultdict(set) +# Maximum indent seen before an entry in the iomem file. +max_indent: int = 0 +# Count for each range of memory. +load_mem_type_cnt: Dict[IomemEntry, int] = collections.Counter() +# Perf event name set from the first sample in the data. 
+event_name: Optional[str] = None def parse_iomem(): - global f - f = open('/proc/iomem', 'r') - for i, j in enumerate(f): - m = re.split('-|:',j,2) - if m[2].strip() == 'System RAM': - system_ram.append(int(m[0], 16)) - system_ram.append(int(m[1], 16)) - if m[2].strip() == 'Persistent Memory': - pmem.append(int(m[0], 16)) - pmem.append(int(m[1], 16)) + """Populate iomem from /proc/iomem file""" + global iomem + global max_indent + global children + with open('/proc/iomem', 'r', encoding='ascii') as f: + for line in f: + indent = 0 + while line[indent] == ' ': + indent += 1 + if indent > max_indent: + max_indent = indent + m = re.split('-|:', line, 2) + begin = int(m[0], 16) + end = int(m[1], 16) + label = m[2].strip() + entry = IomemEntry(begin, end, indent, label) + # Before adding entry, search for a parent node using its begin. + if indent > 0: + parent = find_memory_type(begin) + assert parent, f"Given indent expected a parent for {label}" + children[parent].add(entry) + iomem[indent].append(entry) -def print_memory_type(): - print("Event: %s" % (event_name)) - print("%-40s %10s %10s\n" % ("Memory type", "count", "percentage"), end='') - print("%-40s %10s %10s\n" % ("----------------------------------------", - "-----------", "-----------"), - end=''); - total = sum(load_mem_type_cnt.values()) - for mem_type, count in sorted(load_mem_type_cnt.most_common(), \ - key = lambda kv: (kv[1], kv[0]), reverse = True): - print("%-40s %10d %10.1f%%\n" % - (mem_type, count, 100 * count / total), - end='') +def find_memory_type(phys_addr) -> Optional[IomemEntry]: + """Search iomem for the range containing phys_addr with the maximum indent""" + for i in range(max_indent, -1, -1): + if i not in iomem: + continue + position = bisect.bisect_right(iomem[i], phys_addr, + key=lambda entry: entry.begin) + if position is None: + continue + iomem_entry = iomem[i][position-1] + if iomem_entry.begin <= phys_addr <= iomem_entry.end: + return iomem_entry + print(f"Didn't find {phys_addr}") + return None -def trace_begin(): - parse_iomem() +def print_memory_type(): + print(f"Event: {event_name}") + print(f"{'Memory type':<40} {'count':>10} {'percentage':>10}") + print(f"{'-' * 40:<40} {'-' * 10:>10} {'-' * 10:>10}") + total = sum(load_mem_type_cnt.values()) + # Add count from children into the parent. 
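As an aside, find_memory_type above leans on bisect_right with a key function, which requires Python 3.10 or later; a standalone sketch with made-up ranges:

    import bisect

    ranges = [(0x0, 0xfff), (0x1000, 0x1fff), (0x3000, 0x3fff)]  # sorted by begin
    addr = 0x1234
    pos = bisect.bisect_right(ranges, addr, key=lambda r: r[0])
    begin, end = ranges[pos - 1]   # entry with the largest begin <= addr
    assert begin <= addr <= end    # the end bound still needs an explicit check

The loop that follows folds each child range's count into its parent before printing.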
+ for i in range(max_indent, -1, -1): + if i not in iomem: + continue + for entry in iomem[i]: + global children + for child in children[entry]: + if load_mem_type_cnt[child] > 0: + load_mem_type_cnt[entry] += load_mem_type_cnt[child] -def trace_end(): - print_memory_type() - f.close() + def print_entries(entries): + """Print counts from parents down to their children""" + global children + for entry in sorted(entries, + key = lambda entry: load_mem_type_cnt[entry], + reverse = True): + count = load_mem_type_cnt[entry] + if count > 0: + mem_type = ' ' * entry.indent + f"{entry.begin:x}-{entry.end:x} : {entry.label}" + percent = 100 * count / total + print(f"{mem_type:<40} {count:>10} {percent:>10.1f}") + print_entries(children[entry]) -def is_system_ram(phys_addr): - #/proc/iomem is sorted - position = bisect.bisect(system_ram, phys_addr) - if position % 2 == 0: - return False - return True + print_entries(iomem[0]) -def is_persistent_mem(phys_addr): - position = bisect.bisect(pmem, phys_addr) - if position % 2 == 0: - return False - return True +def trace_begin(): + parse_iomem() -def find_memory_type(phys_addr): - if phys_addr == 0: - return "N/A" - if is_system_ram(phys_addr): - return "System RAM" +def trace_end(): + print_memory_type() - if is_persistent_mem(phys_addr): - return "Persistent Memory" +def process_event(param_dict): + if "sample" not in param_dict: + return - #slow path, search all - f.seek(0, 0) - for j in f: - m = re.split('-|:',j,2) - if int(m[0], 16) <= phys_addr <= int(m[1], 16): - return m[2] - return "N/A" + sample = param_dict["sample"] + if "phys_addr" not in sample: + return -def process_event(param_dict): - name = param_dict["ev_name"] - sample = param_dict["sample"] - phys_addr = sample["phys_addr"] + phys_addr = sample["phys_addr"] + entry = find_memory_type(phys_addr) + if entry: + load_mem_type_cnt[entry] += 1 - global event_name - if event_name == None: - event_name = name - load_mem_type_cnt[find_memory_type(phys_addr)] += 1 + global event_name + if event_name is None: + event_name = param_dict["ev_name"] diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py index 00552eeb7178..30c4bccee5b2 100644 --- a/tools/perf/scripts/python/netdev-times.py +++ b/tools/perf/scripts/python/netdev-times.py @@ -293,7 +293,8 @@ def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm, callchain, skbaddr, location, protocol, reason) all_event_list.append(event_info) -def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, callchain, skbaddr): +def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, callchain, + skbaddr, location): event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, skbaddr) all_event_list.append(event_info) diff --git a/tools/perf/scripts/python/parallel-perf.py b/tools/perf/scripts/python/parallel-perf.py new file mode 100755 index 000000000000..be85fd7f6632 --- /dev/null +++ b/tools/perf/scripts/python/parallel-perf.py @@ -0,0 +1,989 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Run a perf script command multiple times in parallel, using perf script +# options --cpu and --time so that each job processes a different chunk +# of the data. +# +# Copyright (c) 2024, Intel Corporation. 
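For orientation: every job this script creates is an ordinary perf script invocation with --cpu and/or --time options spliced in. A simplified sketch of how a nanosecond range becomes a --time value (NanoSecsToSecsStr below also handles open-ended ranges):

    def ns_to_secs_str(x):
        s = str(x).rjust(10, "0")   # keep at least one digit before the point
        return s[:-9] + "." + s[-9:]

    start_ns, end_ns = 100_000_000_000, 110_000_000_000
    time_option = f"--time={ns_to_secs_str(start_ns)},{ns_to_secs_str(end_ns)}"
    # -> "--time=100.000000000,110.000000000", the start,end form perf script accepts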
+ +import subprocess +import argparse +import pathlib +import shlex +import time +import copy +import sys +import os +import re + +glb_prog_name = "parallel-perf.py" +glb_min_interval = 10.0 +glb_min_samples = 64 + +class Verbosity(): + + def __init__(self, quiet=False, verbose=False, debug=False): + self.normal = True + self.verbose = verbose + self.debug = debug + self.self_test = True + if self.debug: + self.verbose = True + if self.verbose: + quiet = False + if quiet: + self.normal = False + +# Manage work (Start/Wait/Kill), as represented by a subprocess.Popen command +class Work(): + + def __init__(self, cmd, pipe_to, output_dir="."): + self.popen = None + self.consumer = None + self.cmd = cmd + self.pipe_to = pipe_to + self.output_dir = output_dir + self.cmdout_name = f"{output_dir}/cmd.txt" + self.stdout_name = f"{output_dir}/out.txt" + self.stderr_name = f"{output_dir}/err.txt" + + def Command(self): + sh_cmd = [ shlex.quote(x) for x in self.cmd ] + return " ".join(self.cmd) + + def Stdout(self): + return open(self.stdout_name, "w") + + def Stderr(self): + return open(self.stderr_name, "w") + + def CreateOutputDir(self): + pathlib.Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + def Start(self): + if self.popen: + return + self.CreateOutputDir() + with open(self.cmdout_name, "w") as f: + f.write(self.Command()) + f.write("\n") + stdout = self.Stdout() + stderr = self.Stderr() + if self.pipe_to: + self.popen = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=stderr) + args = shlex.split(self.pipe_to) + self.consumer = subprocess.Popen(args, stdin=self.popen.stdout, stdout=stdout, stderr=stderr) + else: + self.popen = subprocess.Popen(self.cmd, stdout=stdout, stderr=stderr) + + def RemoveEmptyErrFile(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) == 0: + os.unlink(self.stderr_name) + + def Errors(self): + if os.path.exists(self.stderr_name): + if os.path.getsize(self.stderr_name) != 0: + return [ f"Non-empty error file {self.stderr_name}" ] + return [] + + def TidyUp(self): + self.RemoveEmptyErrFile() + + def RawPollWait(self, p, wait): + if wait: + return p.wait() + return p.poll() + + def Poll(self, wait=False): + if not self.popen: + return None + result = self.RawPollWait(self.popen, wait) + if self.consumer: + res = result + result = self.RawPollWait(self.consumer, wait) + if result != None and res == None: + self.popen.kill() + result = None + elif result == 0 and res != None and res != 0: + result = res + if result != None: + self.TidyUp() + return result + + def Wait(self): + return self.Poll(wait=True) + + def Kill(self): + if not self.popen: + return + self.popen.kill() + if self.consumer: + self.consumer.kill() + +def KillWork(worklist, verbosity): + for w in worklist: + w.Kill() + for w in worklist: + w.Wait() + +def NumberOfCPUs(): + return os.sysconf("SC_NPROCESSORS_ONLN") + +def NanoSecsToSecsStr(x): + if x == None: + return "" + x = str(x) + if len(x) < 10: + x = "0" * (10 - len(x)) + x + return x[:len(x) - 9] + "." 
+ x[-9:] + +def InsertOptionAfter(cmd, option, after): + try: + pos = cmd.index(after) + cmd.insert(pos + 1, option) + except: + cmd.append(option) + +def CreateWorkList(cmd, pipe_to, output_dir, cpus, time_ranges_by_cpu): + max_len = len(str(cpus[-1])) + cpu_dir_fmt = f"cpu-%.{max_len}u" + worklist = [] + pos = 0 + for cpu in cpus: + if cpu >= 0: + cpu_dir = os.path.join(output_dir, cpu_dir_fmt % cpu) + cpu_option = f"--cpu={cpu}" + else: + cpu_dir = output_dir + cpu_option = None + + tr_dir_fmt = "time-range" + + if len(time_ranges_by_cpu) > 1: + time_ranges = time_ranges_by_cpu[pos] + tr_dir_fmt += f"-{pos}" + pos += 1 + else: + time_ranges = time_ranges_by_cpu[0] + + max_len = len(str(len(time_ranges))) + tr_dir_fmt += f"-%.{max_len}u" + + i = 0 + for r in time_ranges: + if r == [None, None]: + time_option = None + work_output_dir = cpu_dir + else: + time_option = "--time=" + NanoSecsToSecsStr(r[0]) + "," + NanoSecsToSecsStr(r[1]) + work_output_dir = os.path.join(cpu_dir, tr_dir_fmt % i) + i += 1 + work_cmd = list(cmd) + if time_option != None: + InsertOptionAfter(work_cmd, time_option, "script") + if cpu_option != None: + InsertOptionAfter(work_cmd, cpu_option, "script") + w = Work(work_cmd, pipe_to, work_output_dir) + worklist.append(w) + return worklist + +def DoRunWork(worklist, nr_jobs, verbosity): + nr_to_do = len(worklist) + not_started = list(worklist) + running = [] + done = [] + chg = False + while True: + nr_done = len(done) + if chg and verbosity.normal: + nr_run = len(running) + print(f"\rThere are {nr_to_do} jobs: {nr_done} completed, {nr_run} running", flush=True, end=" ") + if verbosity.verbose: + print() + chg = False + if nr_done == nr_to_do: + break + while len(running) < nr_jobs and len(not_started): + w = not_started.pop(0) + running.append(w) + if verbosity.verbose: + print("Starting:", w.Command()) + w.Start() + chg = True + if len(running): + time.sleep(0.1) + finished = [] + not_finished = [] + while len(running): + w = running.pop(0) + r = w.Poll() + if r == None: + not_finished.append(w) + continue + if r == 0: + if verbosity.verbose: + print("Finished:", w.Command()) + finished.append(w) + chg = True + continue + if verbosity.normal and not verbosity.verbose: + print() + print("Job failed!\n return code:", r, "\n command: ", w.Command()) + if w.pipe_to: + print(" piped to: ", w.pipe_to) + print("Killing outstanding jobs") + KillWork(not_finished, verbosity) + KillWork(running, verbosity) + return False + running = not_finished + done += finished + errorlist = [] + for w in worklist: + errorlist += w.Errors() + if len(errorlist): + print("Errors:") + for e in errorlist: + print(e) + elif verbosity.normal: + print("\r"," "*50, "\rAll jobs finished successfully", flush=True) + return True + +def RunWork(worklist, nr_jobs=NumberOfCPUs(), verbosity=Verbosity()): + try: + return DoRunWork(worklist, nr_jobs, verbosity) + except: + for w in worklist: + w.Kill() + raise + return True + +def ReadHeader(perf, file_name): + return subprocess.Popen([perf, "script", "--header-only", "--input", file_name], stdout=subprocess.PIPE).stdout.read().decode("utf-8") + +def ParseHeader(hdr): + result = {} + lines = hdr.split("\n") + for line in lines: + if ":" in line and line[0] == "#": + pos = line.index(":") + name = line[1:pos-1].strip() + value = line[pos+1:].strip() + if name in result: + orig_name = name + nr = 2 + while True: + name = f"{orig_name} {nr}" + if name not in result: + break + nr += 1 + result[name] = value + return result + +def HeaderField(hdr_dict, hdr_fld): 
+ if hdr_fld not in hdr_dict: + raise Exception(f"'{hdr_fld}' missing from header information") + return hdr_dict[hdr_fld] + +# Represent the position of an option within a command string +# and provide the option value and/or remove the option +class OptPos(): + + def Init(self, opt_element=-1, value_element=-1, opt_pos=-1, value_pos=-1, error=None): + self.opt_element = opt_element # list element that contains option + self.value_element = value_element # list element that contains option value + self.opt_pos = opt_pos # string position of option + self.value_pos = value_pos # string position of value + self.error = error # error message string + + def __init__(self, args, short_name, long_name, default=None): + self.args = list(args) + self.default = default + n = 2 + len(long_name) + m = len(short_name) + pos = -1 + for opt in args: + pos += 1 + if m and opt[:2] == f"-{short_name}": + if len(opt) == 2: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, 0, 2) + return + if opt[:n] == f"--{long_name}": + if len(opt) == n: + if pos + 1 < len(args): + self.Init(pos, pos + 1, 0, 0) + else: + self.Init(error = f"--{long_name} option missing value") + elif opt[n] == "=": + self.Init(pos, pos, 0, n + 1) + else: + self.Init(error = f"--{long_name} option expected '='") + return + if m and opt[:1] == "-" and opt[:2] != "--" and short_name in opt: + ipos = opt.index(short_name) + if "-" in opt[1:]: + hpos = opt[1:].index("-") + if hpos < ipos: + continue + if ipos + 1 == len(opt): + if pos + 1 < len(args): + self.Init(pos, pos + 1, ipos, 0) + else: + self.Init(error = f"-{short_name} option missing value") + else: + self.Init(pos, pos, ipos, ipos + 1) + return + self.Init() + + def Value(self): + if self.opt_element >= 0: + if self.opt_element != self.value_element: + return self.args[self.value_element] + else: + return self.args[self.value_element][self.value_pos:] + return self.default + + def Remove(self, args): + if self.opt_element == -1: + return + if self.opt_element != self.value_element: + del args[self.value_element] + if self.opt_pos: + args[self.opt_element] = args[self.opt_element][:self.opt_pos] + else: + del args[self.opt_element] + +def DetermineInputFileName(cmd): + p = OptPos(cmd, "i", "input", "perf.data") + if p.error: + raise Exception(f"perf command {p.error}") + file_name = p.Value() + if not os.path.exists(file_name): + raise Exception(f"perf command input file '{file_name}' not found") + return file_name + +def ReadOption(args, short_name, long_name, err_prefix, remove=False): + p = OptPos(args, short_name, long_name) + if p.error: + raise Exception(f"{err_prefix}{p.error}") + value = p.Value() + if remove: + p.Remove(args) + return value + +def ExtractOption(args, short_name, long_name, err_prefix): + return ReadOption(args, short_name, long_name, err_prefix, True) + +def ReadPerfOption(args, short_name, long_name): + return ReadOption(args, short_name, long_name, "perf command ") + +def ExtractPerfOption(args, short_name, long_name): + return ExtractOption(args, short_name, long_name, "perf command ") + +def PerfDoubleQuickCommands(cmd, file_name): + cpu_str = ReadPerfOption(cmd, "C", "cpu") + time_str = ReadPerfOption(cmd, "", "time") + # Use double-quick sampling to determine trace data density + times_cmd = ["perf", "script", "--ns", "--input", file_name, "--itrace=qqi"] + if cpu_str != None and cpu_str != "": + times_cmd.append(f"--cpu={cpu_str}") + if time_str 
!= None and time_str != "": + times_cmd.append(f"--time={time_str}") + cnts_cmd = list(times_cmd) + cnts_cmd.append("-Fcpu") + times_cmd.append("-Fcpu,time") + return cnts_cmd, times_cmd + +class CPUTimeRange(): + def __init__(self, cpu): + self.cpu = cpu + self.sample_cnt = 0 + self.time_ranges = None + self.interval = 0 + self.interval_remaining = 0 + self.remaining = 0 + self.tr_pos = 0 + +def CalcTimeRangesByCPU(line, cpu, cpu_time_ranges, max_time): + cpu_time_range = cpu_time_ranges[cpu] + cpu_time_range.remaining -= 1 + cpu_time_range.interval_remaining -= 1 + if cpu_time_range.remaining == 0: + cpu_time_range.time_ranges[cpu_time_range.tr_pos][1] = max_time + return + if cpu_time_range.interval_remaining == 0: + time = TimeVal(line[1][:-1], 0) + time_ranges = cpu_time_range.time_ranges + time_ranges[cpu_time_range.tr_pos][1] = time - 1 + time_ranges.append([time, max_time]) + cpu_time_range.tr_pos += 1 + cpu_time_range.interval_remaining = cpu_time_range.interval + +def CountSamplesByCPU(line, cpu, cpu_time_ranges): + try: + cpu_time_ranges[cpu].sample_cnt += 1 + except: + print("exception") + print("cpu", cpu) + print("len(cpu_time_ranges)", len(cpu_time_ranges)) + raise + +def ProcessCommandOutputLines(cmd, per_cpu, fn, *x): + # Assume CPU number is at beginning of line and enclosed by [] + pat = re.compile(r"\s*\[[0-9]+\]") + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) + while True: + line = p.stdout.readline() + if line: + line = line.decode("utf-8") + if pat.match(line): + line = line.split() + if per_cpu: + # Assumes CPU number is enclosed by [] + cpu = int(line[0][1:-1]) + else: + cpu = 0 + fn(line, cpu, *x) + else: + break + p.wait() + +def IntersectTimeRanges(new_time_ranges, time_ranges): + pos = 0 + new_pos = 0 + # Can assume len(time_ranges) != 0 and len(new_time_ranges) != 0 + # Note also, there *must* be at least one intersection. 
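# A worked example of the in-place intersection (made-up nanosecond values):
#   time_ranges     = [[0, 99], [200, 299]]   (the ranges already selected)
#   new_time_ranges = [[50, 249]]             (a proposed split)
# the loop below trims and splits the new list into:
#   new_time_ranges == [[50, 99], [200, 249]]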
+	while pos < len(time_ranges) and new_pos < len(new_time_ranges):
+		# new end < old start => no intersection, remove new
+		if new_time_ranges[new_pos][1] < time_ranges[pos][0]:
+			del new_time_ranges[new_pos]
+			continue
+		# new start > old end => no intersection, check next
+		if new_time_ranges[new_pos][0] > time_ranges[pos][1]:
+			pos += 1
+			if pos < len(time_ranges):
+				continue
+			# no next, so remove remaining
+			while new_pos < len(new_time_ranges):
+				del new_time_ranges[new_pos]
+			return
+		# Found an intersection
+		# new start < old start => adjust new start = old start
+		if new_time_ranges[new_pos][0] < time_ranges[pos][0]:
+			new_time_ranges[new_pos][0] = time_ranges[pos][0]
+		# new end > old end => keep the overlap, insert the remainder
+		if new_time_ranges[new_pos][1] > time_ranges[pos][1]:
+			r = [ time_ranges[pos][1] + 1, new_time_ranges[new_pos][1] ]
+			new_time_ranges[new_pos][1] = time_ranges[pos][1]
+			new_pos += 1
+			new_time_ranges.insert(new_pos, r)
+			continue
+		# new [start, end] is within old [start, end]
+		new_pos += 1
+
+def SplitTimeRangesByTraceDataDensity(time_ranges, cpus, nr, cmd, file_name, per_cpu, min_size, min_interval, verbosity):
+	if verbosity.normal:
+		print("\rAnalyzing...", flush=True, end=" ")
+		if verbosity.verbose:
+			print()
+	cnts_cmd, times_cmd = PerfDoubleQuickCommands(cmd, file_name)
+
+	if per_cpu:
+		nr_cpus = cpus[-1] + 1
+		cpu_time_ranges = [ CPUTimeRange(cpu) for cpu in range(nr_cpus) ]
+	else:
+		nr_cpus = 1
+		cpu_time_ranges = [ CPUTimeRange(-1) ]
+
+	if verbosity.debug:
+		print("nr_cpus", nr_cpus)
+		print("cnts_cmd", cnts_cmd)
+		print("times_cmd", times_cmd)
+
+	# Count the number of "double quick" samples per CPU
+	ProcessCommandOutputLines(cnts_cmd, per_cpu, CountSamplesByCPU, cpu_time_ranges)
+
+	tot = 0
+	mx = 0
+	for cpu_time_range in cpu_time_ranges:
+		cnt = cpu_time_range.sample_cnt
+		tot += cnt
+		if cnt > mx:
+			mx = cnt
+		if verbosity.debug:
+			print("cpu:", cpu_time_range.cpu, "sample_cnt", cnt)
+
+	if min_size < 1:
+		min_size = 1
+
+	if mx < min_size:
+		# Too little data to be worth splitting
+		if verbosity.debug:
+			print("Too little data to split by time")
+		if nr == 0:
+			nr = 1
+		return [ SplitTimeRangesIntoN(time_ranges, nr, min_interval) ]
+
+	if nr:
+		divisor = nr
+		min_size = 1
+	else:
+		divisor = NumberOfCPUs()
+
+	interval = int(round(tot / divisor, 0))
+	if interval < min_size:
+		interval = min_size
+
+	if verbosity.debug:
+		print("divisor", divisor)
+		print("min_size", min_size)
+		print("interval", interval)
+
+	min_time = time_ranges[0][0]
+	max_time = time_ranges[-1][1]
+
+	for cpu_time_range in cpu_time_ranges:
+		cnt = cpu_time_range.sample_cnt
+		if cnt == 0:
+			cpu_time_range.time_ranges = copy.deepcopy(time_ranges)
+			continue
+		# Adjust target interval for CPU to give approximately equal interval sizes
+		# Determine number of intervals, rounding to nearest integer
+		n = int(round(cnt / interval, 0))
+		if n < 1:
+			n = 1
+		# Determine interval size, rounding up
+		d, m = divmod(cnt, n)
+		if m:
+			d += 1
+		cpu_time_range.interval = d
+		cpu_time_range.interval_remaining = d
+		cpu_time_range.remaining = cnt
+		# Initialize each CPU's time ranges to the whole time range
+		cpu_time_range.time_ranges = [ [min_time, max_time] ]
+
+	# Set time ranges so that the same number of "double quick" samples
+	# will fall into each time range.
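+	# For example (hypothetical numbers): a CPU with sample_cnt == 10 and a
+	# target interval of 3 gets n = round(10 / 3) = 3 intervals of size
+	# d = ceil(10 / 3) = 4, so its time range is cut at the timestamps of
+	# the 4th and 8th samples seen on that CPU.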
+	ProcessCommandOutputLines(times_cmd, per_cpu, CalcTimeRangesByCPU, cpu_time_ranges, max_time)
+
+	for cpu_time_range in cpu_time_ranges:
+		if cpu_time_range.sample_cnt:
+			IntersectTimeRanges(cpu_time_range.time_ranges, time_ranges)
+
+	return [cpu_time_ranges[cpu].time_ranges for cpu in cpus]
+
+def SplitSingleTimeRangeIntoN(time_range, n):
+	if n <= 1:
+		return [time_range]
+	start = time_range[0]
+	end = time_range[1]
+	duration = int((end - start + 1) / n)
+	if duration < 1:
+		return [time_range]
+	time_ranges = []
+	for i in range(n):
+		time_ranges.append([start, start + duration - 1])
+		start += duration
+	time_ranges[-1][1] = end
+	return time_ranges
+
+def TimeRangeDuration(r):
+	return r[1] - r[0] + 1
+
+def TotalDuration(time_ranges):
+	duration = 0
+	for r in time_ranges:
+		duration += TimeRangeDuration(r)
+	return duration
+
+def SplitTimeRangesByInterval(time_ranges, interval):
+	new_ranges = []
+	for r in time_ranges:
+		duration = TimeRangeDuration(r)
+		n = duration / interval
+		n = int(round(n, 0))
+		new_ranges += SplitSingleTimeRangeIntoN(r, n)
+	return new_ranges
+
+def SplitTimeRangesIntoN(time_ranges, n, min_interval):
+	if n <= len(time_ranges):
+		return time_ranges
+	duration = TotalDuration(time_ranges)
+	interval = duration / n
+	if interval < min_interval:
+		interval = min_interval
+	return SplitTimeRangesByInterval(time_ranges, interval)
+
+def RecombineTimeRanges(tr):
+	new_tr = copy.deepcopy(tr)
+	i = 1
+	while i < len(new_tr):
+		# if prev end + 1 == cur start, combine them
+		if new_tr[i - 1][1] + 1 == new_tr[i][0]:
+			new_tr[i][0] = new_tr[i - 1][0]
+			del new_tr[i - 1]
+		else:
+			i += 1
+	return new_tr
+
+def OpenTimeRangeEnds(time_ranges, min_time, max_time):
+	if time_ranges[0][0] <= min_time:
+		time_ranges[0][0] = None
+	if time_ranges[-1][1] >= max_time:
+		time_ranges[-1][1] = None
+
+def BadTimeStr(time_str):
+	raise Exception(f"perf command bad time option: '{time_str}'\nCheck also 'time of first sample' and 'time of last sample' in perf script --header-only")
+
+def ValidateTimeRanges(time_ranges, time_str):
+	n = len(time_ranges)
+	for i in range(n):
+		start = time_ranges[i][0]
+		end = time_ranges[i][1]
+		if i != 0 and start <= time_ranges[i - 1][1]:
+			BadTimeStr(time_str)
+		if start > end:
+			BadTimeStr(time_str)
+
+def TimeVal(s, dflt):
+	s = s.strip()
+	if s == "":
+		return dflt
+	a = s.split(".")
+	if len(a) > 2:
+		raise Exception(f"Bad time value '{s}'")
+	x = int(a[0])
+	if x < 0:
+		raise Exception("Negative time not allowed")
+	x *= 1000000000
+	if len(a) > 1:
+		x += int((a[1] + "000000000")[:9])
+	return x
+
+def BadCPUStr(cpu_str):
+	raise Exception(f"perf command bad cpu option: '{cpu_str}'\nCheck also 'nrcpus avail' in perf script --header-only")
+
+def ParseTimeStr(time_str, min_time, max_time):
+	if time_str == None or time_str == "":
+		return [[min_time, max_time]]
+	time_ranges = []
+	for r in time_str.split():
+		a = r.split(",")
+		if len(a) != 2:
+			BadTimeStr(time_str)
+		try:
+			start = TimeVal(a[0], min_time)
+			end = TimeVal(a[1], max_time)
+		except:
+			BadTimeStr(time_str)
+		time_ranges.append([start, end])
+	ValidateTimeRanges(time_ranges, time_str)
+	return time_ranges
+
+def ParseCPUStr(cpu_str, nr_cpus):
+	if cpu_str == None or cpu_str == "":
+		return [-1]
+	cpus = []
+	for r in cpu_str.split(","):
+		a = r.split("-")
+		if len(a) < 1 or len(a) > 2:
+			BadCPUStr(cpu_str)
+		try:
+			start = int(a[0].strip())
+			if len(a) > 1:
+				end = int(a[1].strip())
+			else:
+				end = start
+		except:
+			BadCPUStr(cpu_str)
+		if start < 0 or end < 
0 or end < start or end >= nr_cpus: + BadCPUStr(cpu_str) + cpus.extend(range(start, end + 1)) + cpus = list(set(cpus)) # Remove duplicates + cpus.sort() + return cpus + +class ParallelPerf(): + + def __init__(self, a): + for arg_name in vars(a): + setattr(self, arg_name, getattr(a, arg_name)) + self.orig_nr = self.nr + self.orig_cmd = list(self.cmd) + self.perf = self.cmd[0] + if os.path.exists(self.output_dir): + raise Exception(f"Output '{self.output_dir}' already exists") + if self.jobs < 0 or self.nr < 0 or self.interval < 0: + raise Exception("Bad options (negative values): try -h option for help") + if self.nr != 0 and self.interval != 0: + raise Exception("Cannot specify number of time subdivisions and time interval") + if self.jobs == 0: + self.jobs = NumberOfCPUs() + if self.nr == 0 and self.interval == 0: + if self.per_cpu: + self.nr = 1 + else: + self.nr = self.jobs + + def Init(self): + if self.verbosity.debug: + print("cmd", self.cmd) + self.file_name = DetermineInputFileName(self.cmd) + self.hdr = ReadHeader(self.perf, self.file_name) + self.hdr_dict = ParseHeader(self.hdr) + self.cmd_line = HeaderField(self.hdr_dict, "cmdline") + + def ExtractTimeInfo(self): + self.min_time = TimeVal(HeaderField(self.hdr_dict, "time of first sample"), 0) + self.max_time = TimeVal(HeaderField(self.hdr_dict, "time of last sample"), 0) + self.time_str = ExtractPerfOption(self.cmd, "", "time") + self.time_ranges = ParseTimeStr(self.time_str, self.min_time, self.max_time) + if self.verbosity.debug: + print("time_ranges", self.time_ranges) + + def ExtractCPUInfo(self): + if self.per_cpu: + nr_cpus = int(HeaderField(self.hdr_dict, "nrcpus avail")) + self.cpu_str = ExtractPerfOption(self.cmd, "C", "cpu") + if self.cpu_str == None or self.cpu_str == "": + self.cpus = [ x for x in range(nr_cpus) ] + else: + self.cpus = ParseCPUStr(self.cpu_str, nr_cpus) + else: + self.cpu_str = None + self.cpus = [-1] + if self.verbosity.debug: + print("cpus", self.cpus) + + def IsIntelPT(self): + return self.cmd_line.find("intel_pt") >= 0 + + def SplitTimeRanges(self): + if self.IsIntelPT() and self.interval == 0: + self.split_time_ranges_for_each_cpu = \ + SplitTimeRangesByTraceDataDensity(self.time_ranges, self.cpus, self.orig_nr, + self.orig_cmd, self.file_name, self.per_cpu, + self.min_size, self.min_interval, self.verbosity) + elif self.nr: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesIntoN(self.time_ranges, self.nr, self.min_interval) ] + else: + self.split_time_ranges_for_each_cpu = [ SplitTimeRangesByInterval(self.time_ranges, self.interval) ] + + def CheckTimeRanges(self): + for tr in self.split_time_ranges_for_each_cpu: + # Re-combined time ranges should be the same + new_tr = RecombineTimeRanges(tr) + if new_tr != self.time_ranges: + if self.verbosity.debug: + print("tr", tr) + print("new_tr", new_tr) + raise Exception("Self test failed!") + + def OpenTimeRangeEnds(self): + for time_ranges in self.split_time_ranges_for_each_cpu: + OpenTimeRangeEnds(time_ranges, self.min_time, self.max_time) + + def CreateWorkList(self): + self.worklist = CreateWorkList(self.cmd, self.pipe_to, self.output_dir, self.cpus, self.split_time_ranges_for_each_cpu) + + def PerfDataRecordedPerCPU(self): + if "--per-thread" in self.cmd_line.split(): + return False + return True + + def DefaultToPerCPU(self): + # --no-per-cpu option takes precedence + if self.no_per_cpu: + return False + if not self.PerfDataRecordedPerCPU(): + return False + # Default to per-cpu for Intel PT data that was recorded per-cpu, + # because 
decoding can be done for each CPU separately. + if self.IsIntelPT(): + return True + return False + + def Config(self): + self.Init() + self.ExtractTimeInfo() + if not self.per_cpu: + self.per_cpu = self.DefaultToPerCPU() + if self.verbosity.debug: + print("per_cpu", self.per_cpu) + self.ExtractCPUInfo() + self.SplitTimeRanges() + if self.verbosity.self_test: + self.CheckTimeRanges() + # Prefer open-ended time range to starting / ending with min_time / max_time resp. + self.OpenTimeRangeEnds() + self.CreateWorkList() + + def Run(self): + if self.dry_run: + print(len(self.worklist),"jobs:") + for w in self.worklist: + print(w.Command()) + return True + result = RunWork(self.worklist, self.jobs, verbosity=self.verbosity) + if self.verbosity.verbose: + print(glb_prog_name, "done") + return result + +def RunParallelPerf(a): + pp = ParallelPerf(a) + pp.Config() + return pp.Run() + +def Main(args): + ap = argparse.ArgumentParser( + prog=glb_prog_name, formatter_class = argparse.RawDescriptionHelpFormatter, + description = +""" +Run a perf script command multiple times in parallel, using perf script options +--cpu and --time so that each job processes a different chunk of the data. +""", + epilog = +""" +Follow the options by '--' and then the perf script command e.g. + + $ perf record -a -- sleep 10 + $ parallel-perf.py --nr=4 -- perf script --ns + All jobs finished successfully + $ tree parallel-perf-output/ + parallel-perf-output/ + ├── time-range-0 + │ ├── cmd.txt + │ └── out.txt + ├── time-range-1 + │ ├── cmd.txt + │ └── out.txt + ├── time-range-2 + │ ├── cmd.txt + │ └── out.txt + └── time-range-3 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/time-range-0/cmd.txt:perf script --time=,9466.504461499 --ns + parallel-perf-output/time-range-1/cmd.txt:perf script --time=9466.504461500,9469.005396999 --ns + parallel-perf-output/time-range-2/cmd.txt:perf script --time=9469.005397000,9471.506332499 --ns + parallel-perf-output/time-range-3/cmd.txt:perf script --time=9471.506332500, --ns + +Any perf script command can be used, including the use of perf script options +--dlfilter and --script, so that the benefit of running parallel jobs +naturally extends to them also. + +If option --pipe-to is used, standard output is first piped through that +command. Beware, if the command fails (e.g. grep with no matches), it will be +considered a fatal error. + +Final standard output is redirected to files named out.txt in separate +subdirectories under the output directory. Similarly, standard error is +written to files named err.txt. In addition, files named cmd.txt contain the +corresponding perf script command. After processing, err.txt files are removed +if they are empty. + +If any job exits with a non-zero exit code, then all jobs are killed and no +more are started. A message is printed if any job results in a non-empty +err.txt file. + +There is a separate output subdirectory for each time range. If the --per-cpu +option is used, these are further grouped under cpu-n subdirectories, e.g. 
+ + $ parallel-perf.py --per-cpu --nr=2 -- perf script --ns --cpu=0,1 + All jobs finished successfully + $ tree parallel-perf-output + parallel-perf-output/ + ├── cpu-0 + │ ├── time-range-0 + │ │ ├── cmd.txt + │ │ └── out.txt + │ └── time-range-1 + │ ├── cmd.txt + │ └── out.txt + └── cpu-1 + ├── time-range-0 + │ ├── cmd.txt + │ └── out.txt + └── time-range-1 + ├── cmd.txt + └── out.txt + $ find parallel-perf-output -name cmd.txt | sort | xargs grep -H . + parallel-perf-output/cpu-0/time-range-0/cmd.txt:perf script --cpu=0 --time=,9469.005396999 --ns + parallel-perf-output/cpu-0/time-range-1/cmd.txt:perf script --cpu=0 --time=9469.005397000, --ns + parallel-perf-output/cpu-1/time-range-0/cmd.txt:perf script --cpu=1 --time=,9469.005396999 --ns + parallel-perf-output/cpu-1/time-range-1/cmd.txt:perf script --cpu=1 --time=9469.005397000, --ns + +Subdivisions of time range, and cpus if the --per-cpu option is used, are +expressed by the --time and --cpu perf script options respectively. If the +supplied perf script command has a --time option, then that time range is +subdivided, otherwise the time range given by 'time of first sample' to +'time of last sample' is used (refer perf script --header-only). Similarly, the +supplied perf script command may provide a --cpu option, and only those CPUs +will be processed. + +To prevent time intervals becoming too small, the --min-interval option can +be used. + +Note there is special handling for processing Intel PT traces. If an interval is +not specified and the perf record command contained the intel_pt event, then the +time range will be subdivided in order to produce subdivisions that contain +approximately the same amount of trace data. That is accomplished by counting +double-quick (--itrace=qqi) samples, and choosing time ranges that encompass +approximately the same number of samples. In that case, time ranges may not be +the same for each CPU processed. For Intel PT, --per-cpu is the default, but +that can be overridden by --no-per-cpu. Note, for Intel PT, double-quick +decoding produces 1 sample for each PSB synchronization packet, which in turn +come after a certain number of bytes output, determined by psb_period (refer +perf Intel PT documentation). The minimum number of double-quick samples that +will define a time range can be set by the --min_size option, which defaults to +64. +""") + ap.add_argument("-o", "--output-dir", default="parallel-perf-output", help="output directory (default 'parallel-perf-output')") + ap.add_argument("-j", "--jobs", type=int, default=0, help="maximum number of jobs to run in parallel at one time (default is the number of CPUs)") + ap.add_argument("-n", "--nr", type=int, default=0, help="number of time subdivisions (default is the number of jobs)") + ap.add_argument("-i", "--interval", type=float, default=0, help="subdivide the time range using this time interval (in seconds e.g. 
0.1 for a tenth of a second)")
+	ap.add_argument("-c", "--per-cpu", action="store_true", help="process data for each CPU in parallel")
+	ap.add_argument("-m", "--min-interval", type=float, default=glb_min_interval, help=f"minimum interval (default {glb_min_interval} seconds)")
+	ap.add_argument("-p", "--pipe-to", help="command to pipe output to (optional)")
+	ap.add_argument("-N", "--no-per-cpu", action="store_true", help="do not process data for each CPU in parallel")
+	ap.add_argument("-b", "--min_size", type=int, default=glb_min_samples, help="minimum data size (for Intel PT in PSBs)")
+	ap.add_argument("-D", "--dry-run", action="store_true", help="do not run any jobs, just show the perf script commands")
+	ap.add_argument("-q", "--quiet", action="store_true", help="do not print any messages except errors")
+	ap.add_argument("-v", "--verbose", action="store_true", help="print more messages")
+	ap.add_argument("-d", "--debug", action="store_true", help="print debugging messages")
+	cmd_line = list(args)
+	try:
+		split_pos = cmd_line.index("--")
+		cmd = cmd_line[split_pos + 1:]
+		args = cmd_line[:split_pos]
+	except:
+		cmd = None
+		args = cmd_line
+	a = ap.parse_args(args=args[1:])
+	a.cmd = cmd
+	a.verbosity = Verbosity(a.quiet, a.verbose, a.debug)
+	try:
+		if a.cmd == None:
+			if len(args) <= 1:
+				ap.print_help()
+				return True
+			raise Exception("Command line must contain '--' before perf command")
+		return RunParallelPerf(a)
+	except Exception as e:
+		print("Fatal error:", str(e))
+		if a.debug:
+			raise
+		return False
+
+if __name__ == "__main__":
+	if not Main(sys.argv):
+		sys.exit(1)
