summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r--tools/accounting/Makefile2
-rw-r--r--tools/accounting/delaytop.c862
-rw-r--r--tools/accounting/getdelays.c167
-rw-r--r--tools/power/x86/turbostat/turbostat.811
-rw-r--r--tools/power/x86/turbostat/turbostat.c768
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_misc.h3
-rw-r--r--tools/testing/selftests/cachestat/test_cachestat.c62
-rw-r--r--tools/testing/selftests/drivers/net/Makefile1
-rwxr-xr-xtools/testing/selftests/drivers/net/napi_threaded.py111
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/nexthop.sh2
-rw-r--r--tools/testing/selftests/kho/arm64.conf9
-rw-r--r--tools/testing/selftests/kho/init.c100
-rwxr-xr-xtools/testing/selftests/kho/vmtest.sh183
-rw-r--r--tools/testing/selftests/kho/x86.conf7
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile1
-rw-r--r--tools/testing/selftests/mm/process_madv.c344
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh5
-rwxr-xr-xtools/testing/selftests/net/packetdrill/ksft_runner.sh19
-rwxr-xr-xtools/testing/selftests/net/test_neigh.sh6
-rwxr-xr-xtools/testing/selftests/net/vlan_hw_filter.sh16
-rw-r--r--tools/testing/selftests/perf_events/.gitignore1
-rw-r--r--tools/testing/selftests/perf_events/Makefile2
-rw-r--r--tools/testing/selftests/perf_events/mmap.c236
-rw-r--r--tools/testing/selftests/ptrace/.gitignore1
-rw-r--r--tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c16
-rw-r--r--tools/testing/vma/vma_internal.h6
27 files changed, 2598 insertions, 344 deletions
diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile
index 11def1ad046c..20bbd461515e 100644
--- a/tools/accounting/Makefile
+++ b/tools/accounting/Makefile
@@ -2,7 +2,7 @@
CC := $(CROSS_COMPILE)gcc
CFLAGS := -I../../usr/include
-PROGS := getdelays procacct
+PROGS := getdelays procacct delaytop
all: $(PROGS)
diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c
new file mode 100644
index 000000000000..9afb1ffc00ba
--- /dev/null
+++ b/tools/accounting/delaytop.c
@@ -0,0 +1,862 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * delaytop.c - system-wide delay monitoring tool.
+ *
+ * This tool provides real-time monitoring and statistics of
+ * system, container, and task-level delays, including CPU,
+ * memory, IO, and IRQ. It supports both interactive (top-like),
+ * and can output delay information for the whole system, specific
+ * containers (cgroups), or individual tasks (PIDs).
+ *
+ * Key features:
+ * - Collects per-task delay accounting statistics via taskstats.
+ * - Collects system-wide PSI information.
+ * - Supports sorting, filtering.
+ * - Supports both interactive (screen refresh).
+ *
+ * Copyright (C) Fan Yu, ZTE Corp. 2025
+ * Copyright (C) Wang Yaxin, ZTE Corp. 2025
+ *
+ * Compile with
+ * gcc -I/usr/src/linux/include delaytop.c -o delaytop
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <signal.h>
+#include <time.h>
+#include <dirent.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <termios.h>
+#include <limits.h>
+#include <linux/genetlink.h>
+#include <linux/taskstats.h>
+#include <linux/cgroupstats.h>
+
+#define PSI_CPU_SOME "/proc/pressure/cpu"
+#define PSI_CPU_FULL "/proc/pressure/cpu"
+#define PSI_MEMORY_SOME "/proc/pressure/memory"
+#define PSI_MEMORY_FULL "/proc/pressure/memory"
+#define PSI_IO_SOME "/proc/pressure/io"
+#define PSI_IO_FULL "/proc/pressure/io"
+#define PSI_IRQ_FULL "/proc/pressure/irq"
+
+#define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
+#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len) (len - NLA_HDRLEN)
+
+#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+
+#define TASK_COMM_LEN 16
+#define MAX_MSG_SIZE 1024
+#define MAX_TASKS 1000
+#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
+#define BOOL_FPRINT(stream, fmt, ...) \
+({ \
+ int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
+ ret >= 0; \
+})
+#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
+
+/* Program settings structure */
+struct config {
+ int delay; /* Update interval in seconds */
+ int iterations; /* Number of iterations, 0 == infinite */
+ int max_processes; /* Maximum number of processes to show */
+ char sort_field; /* Field to sort by */
+ int output_one_time; /* Output once and exit */
+ int monitor_pid; /* Monitor specific PID */
+ char *container_path; /* Path to container cgroup */
+};
+
+/* PSI statistics structure */
+struct psi_stats {
+ double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300;
+ unsigned long long cpu_some_total;
+ double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300;
+ unsigned long long cpu_full_total;
+ double memory_some_avg10, memory_some_avg60, memory_some_avg300;
+ unsigned long long memory_some_total;
+ double memory_full_avg10, memory_full_avg60, memory_full_avg300;
+ unsigned long long memory_full_total;
+ double io_some_avg10, io_some_avg60, io_some_avg300;
+ unsigned long long io_some_total;
+ double io_full_avg10, io_full_avg60, io_full_avg300;
+ unsigned long long io_full_total;
+ double irq_full_avg10, irq_full_avg60, irq_full_avg300;
+ unsigned long long irq_full_total;
+};
+
+/* Task delay information structure */
+struct task_info {
+ int pid;
+ int tgid;
+ char command[TASK_COMM_LEN];
+ unsigned long long cpu_count;
+ unsigned long long cpu_delay_total;
+ unsigned long long blkio_count;
+ unsigned long long blkio_delay_total;
+ unsigned long long swapin_count;
+ unsigned long long swapin_delay_total;
+ unsigned long long freepages_count;
+ unsigned long long freepages_delay_total;
+ unsigned long long thrashing_count;
+ unsigned long long thrashing_delay_total;
+ unsigned long long compact_count;
+ unsigned long long compact_delay_total;
+ unsigned long long wpcopy_count;
+ unsigned long long wpcopy_delay_total;
+ unsigned long long irq_count;
+ unsigned long long irq_delay_total;
+};
+
+/* Container statistics structure */
+struct container_stats {
+ int nr_sleeping; /* Number of sleeping processes */
+ int nr_running; /* Number of running processes */
+ int nr_stopped; /* Number of stopped processes */
+ int nr_uninterruptible; /* Number of uninterruptible processes */
+ int nr_io_wait; /* Number of processes in IO wait */
+};
+
+/* Global variables */
+static struct config cfg;
+static struct psi_stats psi;
+static struct task_info tasks[MAX_TASKS];
+static int task_count;
+static int running = 1;
+static struct container_stats container_stats;
+
+/* Netlink socket variables */
+static int nl_sd = -1;
+static int family_id;
+
+/* Set terminal to non-canonical mode for q-to-quit */
+static struct termios orig_termios;
+static void enable_raw_mode(void)
+{
+ struct termios raw;
+
+ tcgetattr(STDIN_FILENO, &orig_termios);
+ raw = orig_termios;
+ raw.c_lflag &= ~(ICANON | ECHO);
+ tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw);
+}
+static void disable_raw_mode(void)
+{
+ tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
+}
+
+/* Display usage information and command line options */
+static void usage(void)
+{
+ printf("Usage: delaytop [Options]\n"
+ "Options:\n"
+ " -h, --help Show this help message and exit\n"
+ " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n"
+ " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n"
+ " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n"
+ " -o, --once Display once and exit\n"
+ " -p, --pid=PID Monitor only the specified PID\n"
+ " -C, --container=PATH Monitor the container at specified cgroup path\n");
+ exit(0);
+}
+
+/* Parse command line arguments and set configuration */
+static void parse_args(int argc, char **argv)
+{
+ int c;
+ struct option long_options[] = {
+ {"help", no_argument, 0, 'h'},
+ {"delay", required_argument, 0, 'd'},
+ {"iterations", required_argument, 0, 'n'},
+ {"pid", required_argument, 0, 'p'},
+ {"once", no_argument, 0, 'o'},
+ {"processes", required_argument, 0, 'P'},
+ {"container", required_argument, 0, 'C'},
+ {0, 0, 0, 0}
+ };
+
+ /* Set defaults */
+ cfg.delay = 2;
+ cfg.iterations = 0;
+ cfg.max_processes = 20;
+ cfg.sort_field = 'c'; /* Default sort by CPU delay */
+ cfg.output_one_time = 0;
+ cfg.monitor_pid = 0; /* 0 means monitor all PIDs */
+ cfg.container_path = NULL;
+
+ while (1) {
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index);
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'h':
+ usage();
+ break;
+ case 'd':
+ cfg.delay = atoi(optarg);
+ if (cfg.delay < 1) {
+ fprintf(stderr, "Error: delay must be >= 1.\n");
+ exit(1);
+ }
+ break;
+ case 'n':
+ cfg.iterations = atoi(optarg);
+ if (cfg.iterations < 0) {
+ fprintf(stderr, "Error: iterations must be >= 0.\n");
+ exit(1);
+ }
+ break;
+ case 'p':
+ cfg.monitor_pid = atoi(optarg);
+ if (cfg.monitor_pid < 1) {
+ fprintf(stderr, "Error: pid must be >= 1.\n");
+ exit(1);
+ }
+ break;
+ case 'o':
+ cfg.output_one_time = 1;
+ break;
+ case 'P':
+ cfg.max_processes = atoi(optarg);
+ if (cfg.max_processes < 1) {
+ fprintf(stderr, "Error: processes must be >= 1.\n");
+ exit(1);
+ }
+ if (cfg.max_processes > MAX_TASKS) {
+ fprintf(stderr, "Warning: processes capped to %d.\n",
+ MAX_TASKS);
+ cfg.max_processes = MAX_TASKS;
+ }
+ break;
+ case 'C':
+ cfg.container_path = strdup(optarg);
+ break;
+ default:
+ fprintf(stderr, "Try 'delaytop --help' for more information.\n");
+ exit(1);
+ }
+ }
+}
+
+/* Create a raw netlink socket and bind */
+static int create_nl_socket(void)
+{
+ int fd;
+ struct sockaddr_nl local;
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ if (fd < 0)
+ return -1;
+
+ memset(&local, 0, sizeof(local));
+ local.nl_family = AF_NETLINK;
+
+ if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
+ fprintf(stderr, "Failed to bind socket when create nl_socket\n");
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+/* Send a command via netlink */
+static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+ __u8 genl_cmd, __u16 nla_type,
+ void *nla_data, int nla_len)
+{
+ struct sockaddr_nl nladdr;
+ struct nlattr *na;
+ int r, buflen;
+ char *buf;
+
+ struct {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[MAX_MSG_SIZE];
+ } msg;
+
+ msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+ msg.n.nlmsg_type = nlmsg_type;
+ msg.n.nlmsg_flags = NLM_F_REQUEST;
+ msg.n.nlmsg_seq = 0;
+ msg.n.nlmsg_pid = nlmsg_pid;
+ msg.g.cmd = genl_cmd;
+ msg.g.version = 0x1;
+ na = (struct nlattr *) GENLMSG_DATA(&msg);
+ na->nla_type = nla_type;
+ na->nla_len = nla_len + NLA_HDRLEN;
+ memcpy(NLA_DATA(na), nla_data, nla_len);
+ msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+ buf = (char *) &msg;
+ buflen = msg.n.nlmsg_len;
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+ while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+ sizeof(nladdr))) < buflen) {
+ if (r > 0) {
+ buf += r;
+ buflen -= r;
+ } else if (errno != EAGAIN)
+ return -1;
+ }
+ return 0;
+}
+
+/* Get family ID for taskstats via netlink */
+static int get_family_id(int sd)
+{
+ struct {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[256];
+ } ans;
+
+ int id = 0, rc;
+ struct nlattr *na;
+ int rep_len;
+ char name[100];
+
+ strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
+ name[sizeof(name) - 1] = '\0';
+ rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+ CTRL_ATTR_FAMILY_NAME, (void *)name,
+ strlen(TASKSTATS_GENL_NAME)+1);
+ if (rc < 0) {
+ fprintf(stderr, "Failed to send cmd for family id\n");
+ return 0;
+ }
+
+ rep_len = recv(sd, &ans, sizeof(ans), 0);
+ if (ans.n.nlmsg_type == NLMSG_ERROR ||
+ (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) {
+ fprintf(stderr, "Failed to receive response for family id\n");
+ return 0;
+ }
+
+ na = (struct nlattr *) GENLMSG_DATA(&ans);
+ na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+ if (na->nla_type == CTRL_ATTR_FAMILY_ID)
+ id = *(__u16 *) NLA_DATA(na);
+ return id;
+}
+
+static void read_psi_stats(void)
+{
+ FILE *fp;
+ char line[256];
+ int ret = 0;
+ /* Zero all fields */
+ memset(&psi, 0, sizeof(psi));
+ /* CPU pressure */
+ fp = fopen(PSI_CPU_SOME, "r");
+ if (fp) {
+ while (fgets(line, sizeof(line), fp)) {
+ if (strncmp(line, "some", 4) == 0) {
+ ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.cpu_some_avg10, &psi.cpu_some_avg60,
+ &psi.cpu_some_avg300, &psi.cpu_some_total);
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse CPU some PSI data\n");
+ } else if (strncmp(line, "full", 4) == 0) {
+ ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.cpu_full_avg10, &psi.cpu_full_avg60,
+ &psi.cpu_full_avg300, &psi.cpu_full_total);
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse CPU full PSI data\n");
+ }
+ }
+ fclose(fp);
+ }
+ /* Memory pressure */
+ fp = fopen(PSI_MEMORY_SOME, "r");
+ if (fp) {
+ while (fgets(line, sizeof(line), fp)) {
+ if (strncmp(line, "some", 4) == 0) {
+ ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.memory_some_avg10, &psi.memory_some_avg60,
+ &psi.memory_some_avg300, &psi.memory_some_total);
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse Memory some PSI data\n");
+ } else if (strncmp(line, "full", 4) == 0) {
+ ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.memory_full_avg10, &psi.memory_full_avg60,
+ &psi.memory_full_avg300, &psi.memory_full_total);
+ }
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse Memory full PSI data\n");
+ }
+ fclose(fp);
+ }
+ /* IO pressure */
+ fp = fopen(PSI_IO_SOME, "r");
+ if (fp) {
+ while (fgets(line, sizeof(line), fp)) {
+ if (strncmp(line, "some", 4) == 0) {
+ ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.io_some_avg10, &psi.io_some_avg60,
+ &psi.io_some_avg300, &psi.io_some_total);
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse IO some PSI data\n");
+ } else if (strncmp(line, "full", 4) == 0) {
+ ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.io_full_avg10, &psi.io_full_avg60,
+ &psi.io_full_avg300, &psi.io_full_total);
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse IO full PSI data\n");
+ }
+ }
+ fclose(fp);
+ }
+ /* IRQ pressure (only full) */
+ fp = fopen(PSI_IRQ_FULL, "r");
+ if (fp) {
+ while (fgets(line, sizeof(line), fp)) {
+ if (strncmp(line, "full", 4) == 0) {
+ ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
+ &psi.irq_full_avg10, &psi.irq_full_avg60,
+ &psi.irq_full_avg300, &psi.irq_full_total);
+ if (ret != 4)
+ fprintf(stderr, "Failed to parse IRQ full PSI data\n");
+ }
+ }
+ fclose(fp);
+ }
+}
+
+static int read_comm(int pid, char *comm_buf, size_t buf_size)
+{
+ char path[64];
+ int ret = -1;
+ size_t len;
+ FILE *fp;
+
+ snprintf(path, sizeof(path), "/proc/%d/comm", pid);
+ fp = fopen(path, "r");
+ if (!fp) {
+ fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
+ return ret;
+ }
+
+ if (fgets(comm_buf, buf_size, fp)) {
+ len = strlen(comm_buf);
+ if (len > 0 && comm_buf[len - 1] == '\n')
+ comm_buf[len - 1] = '\0';
+ ret = 0;
+ }
+
+ fclose(fp);
+
+ return ret;
+}
+
+static void fetch_and_fill_task_info(int pid, const char *comm)
+{
+ struct {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[MAX_MSG_SIZE];
+ } resp;
+ struct taskstats stats;
+ struct nlattr *nested;
+ struct nlattr *na;
+ int nested_len;
+ int nl_len;
+ int rc;
+
+ /* Send request for task stats */
+ if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
+ TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
+ fprintf(stderr, "Failed to send request for task stats\n");
+ return;
+ }
+
+ /* Receive response */
+ rc = recv(nl_sd, &resp, sizeof(resp), 0);
+ if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
+ fprintf(stderr, "Failed to receive response for task stats\n");
+ return;
+ }
+
+ /* Parse response */
+ nl_len = GENLMSG_PAYLOAD(&resp.n);
+ na = (struct nlattr *) GENLMSG_DATA(&resp);
+ while (nl_len > 0) {
+ if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
+ nested = (struct nlattr *) NLA_DATA(na);
+ nested_len = NLA_PAYLOAD(na->nla_len);
+ while (nested_len > 0) {
+ if (nested->nla_type == TASKSTATS_TYPE_STATS) {
+ memcpy(&stats, NLA_DATA(nested), sizeof(stats));
+ if (task_count < MAX_TASKS) {
+ tasks[task_count].pid = pid;
+ tasks[task_count].tgid = pid;
+ strncpy(tasks[task_count].command, comm,
+ TASK_COMM_LEN - 1);
+ tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
+ SET_TASK_STAT(task_count, cpu_count);
+ SET_TASK_STAT(task_count, cpu_delay_total);
+ SET_TASK_STAT(task_count, blkio_count);
+ SET_TASK_STAT(task_count, blkio_delay_total);
+ SET_TASK_STAT(task_count, swapin_count);
+ SET_TASK_STAT(task_count, swapin_delay_total);
+ SET_TASK_STAT(task_count, freepages_count);
+ SET_TASK_STAT(task_count, freepages_delay_total);
+ SET_TASK_STAT(task_count, thrashing_count);
+ SET_TASK_STAT(task_count, thrashing_delay_total);
+ SET_TASK_STAT(task_count, compact_count);
+ SET_TASK_STAT(task_count, compact_delay_total);
+ SET_TASK_STAT(task_count, wpcopy_count);
+ SET_TASK_STAT(task_count, wpcopy_delay_total);
+ SET_TASK_STAT(task_count, irq_count);
+ SET_TASK_STAT(task_count, irq_delay_total);
+ task_count++;
+ }
+ break;
+ }
+ nested_len -= NLA_ALIGN(nested->nla_len);
+ nested = NLA_NEXT(nested);
+ }
+ }
+ nl_len -= NLA_ALIGN(na->nla_len);
+ na = NLA_NEXT(na);
+ }
+ return;
+}
+
+static void get_task_delays(void)
+{
+ char comm[TASK_COMM_LEN];
+ struct dirent *entry;
+ DIR *dir;
+ int pid;
+
+ task_count = 0;
+ if (cfg.monitor_pid > 0) {
+ if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
+ fetch_and_fill_task_info(cfg.monitor_pid, comm);
+ return;
+ }
+
+ dir = opendir("/proc");
+ if (!dir) {
+ fprintf(stderr, "Error opening /proc directory\n");
+ return;
+ }
+
+ while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
+ if (!isdigit(entry->d_name[0]))
+ continue;
+ pid = atoi(entry->d_name);
+ if (pid == 0)
+ continue;
+ if (read_comm(pid, comm, sizeof(comm)) != 0)
+ continue;
+ fetch_and_fill_task_info(pid, comm);
+ }
+ closedir(dir);
+}
+
+/* Calculate average delay in milliseconds */
+static double average_ms(unsigned long long total, unsigned long long count)
+{
+ if (count == 0)
+ return 0;
+ return (double)total / 1000000.0 / count;
+}
+
+/* Comparison function for sorting tasks */
+static int compare_tasks(const void *a, const void *b)
+{
+ const struct task_info *t1 = (const struct task_info *)a;
+ const struct task_info *t2 = (const struct task_info *)b;
+ double avg1, avg2;
+
+ switch (cfg.sort_field) {
+ case 'c': /* CPU */
+ avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count);
+ avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count);
+ if (avg1 != avg2)
+ return avg2 > avg1 ? 1 : -1;
+ return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
+
+ default:
+ return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
+ }
+}
+
+/* Sort tasks by selected field */
+static void sort_tasks(void)
+{
+ if (task_count > 0)
+ qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
+}
+
+/* Get container statistics via cgroupstats */
+static void get_container_stats(void)
+{
+ int rc, cfd;
+ struct {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[MAX_MSG_SIZE];
+ } req, resp;
+ struct nlattr *na;
+ int nl_len;
+ struct cgroupstats stats;
+
+ /* Check if container path is set */
+ if (!cfg.container_path)
+ return;
+
+ /* Open container cgroup */
+ cfd = open(cfg.container_path, O_RDONLY);
+ if (cfd < 0) {
+ fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
+ return;
+ }
+
+ /* Send request for container stats */
+ if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
+ CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
+ fprintf(stderr, "Failed to send request for container stats\n");
+ close(cfd);
+ return;
+ }
+
+ /* Receive response */
+ rc = recv(nl_sd, &resp, sizeof(resp), 0);
+ if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
+ fprintf(stderr, "Failed to receive response for container stats\n");
+ close(cfd);
+ return;
+ }
+
+ /* Parse response */
+ nl_len = GENLMSG_PAYLOAD(&resp.n);
+ na = (struct nlattr *) GENLMSG_DATA(&resp);
+ while (nl_len > 0) {
+ if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
+ /* Get the cgroupstats structure */
+ memcpy(&stats, NLA_DATA(na), sizeof(stats));
+
+ /* Fill container stats */
+ container_stats.nr_sleeping = stats.nr_sleeping;
+ container_stats.nr_running = stats.nr_running;
+ container_stats.nr_stopped = stats.nr_stopped;
+ container_stats.nr_uninterruptible = stats.nr_uninterruptible;
+ container_stats.nr_io_wait = stats.nr_io_wait;
+ break;
+ }
+ nl_len -= NLA_ALIGN(na->nla_len);
+ na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+ }
+
+ close(cfd);
+}
+
+/* Display results to stdout or log file */
+static void display_results(void)
+{
+ time_t now = time(NULL);
+ struct tm *tm_now = localtime(&now);
+ FILE *out = stdout;
+ char timestamp[32];
+ bool suc = true;
+ int i, count;
+
+ /* Clear terminal screen */
+ suc &= BOOL_FPRINT(out, "\033[H\033[J");
+
+ /* PSI output (one-line, no cat style) */
+ suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "CPU some:",
+ psi.cpu_some_avg10,
+ psi.cpu_some_avg60,
+ psi.cpu_some_avg300,
+ psi.cpu_some_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "CPU full:",
+ psi.cpu_full_avg10,
+ psi.cpu_full_avg60,
+ psi.cpu_full_avg300,
+ psi.cpu_full_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "Memory full:",
+ psi.memory_full_avg10,
+ psi.memory_full_avg60,
+ psi.memory_full_avg300,
+ psi.memory_full_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "Memory some:",
+ psi.memory_some_avg10,
+ psi.memory_some_avg60,
+ psi.memory_some_avg300,
+ psi.memory_some_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "IO full:",
+ psi.io_full_avg10,
+ psi.io_full_avg60,
+ psi.io_full_avg300,
+ psi.io_full_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "IO some:",
+ psi.io_some_avg10,
+ psi.io_some_avg60,
+ psi.io_some_avg300,
+ psi.io_some_total / 1000);
+ suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
+ "IRQ full:",
+ psi.irq_full_avg10,
+ psi.irq_full_avg60,
+ psi.irq_full_avg300,
+ psi.irq_full_total / 1000);
+
+ if (cfg.container_path) {
+ suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
+ suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
+ container_stats.nr_running, container_stats.nr_sleeping);
+ suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
+ container_stats.nr_stopped, container_stats.nr_uninterruptible,
+ container_stats.nr_io_wait);
+ }
+ suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n",
+ cfg.max_processes);
+ suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND");
+ suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n",
+ "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)",
+ "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)");
+
+ suc &= BOOL_FPRINT(out, "-----------------------------------------------");
+ suc &= BOOL_FPRINT(out, "----------------------------------------------\n");
+ count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
+
+ for (i = 0; i < count; i++) {
+ suc &= BOOL_FPRINT(out, "%5d %5d %-15s",
+ tasks[i].pid, tasks[i].tgid, tasks[i].command);
+ suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n",
+ average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count),
+ average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count),
+ average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count),
+ average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count),
+ average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count),
+ average_ms(tasks[i].compact_delay_total, tasks[i].compact_count),
+ average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count),
+ average_ms(tasks[i].irq_delay_total, tasks[i].irq_count));
+ }
+
+ suc &= BOOL_FPRINT(out, "\n");
+
+ if (!suc)
+ perror("Error writing to output");
+}
+
+/* Main function */
+int main(int argc, char **argv)
+{
+ int iterations = 0;
+ int use_q_quit = 0;
+
+ /* Parse command line arguments */
+ parse_args(argc, argv);
+
+ /* Setup netlink socket */
+ nl_sd = create_nl_socket();
+ if (nl_sd < 0) {
+ fprintf(stderr, "Error creating netlink socket\n");
+ exit(1);
+ }
+
+ /* Get family ID for taskstats via netlink */
+ family_id = get_family_id(nl_sd);
+ if (!family_id) {
+ fprintf(stderr, "Error getting taskstats family ID\n");
+ close(nl_sd);
+ exit(1);
+ }
+
+ if (!cfg.output_one_time) {
+ use_q_quit = 1;
+ enable_raw_mode();
+ printf("Press 'q' to quit.\n");
+ fflush(stdout);
+ }
+
+ /* Main loop */
+ while (running) {
+ /* Read PSI statistics */
+ read_psi_stats();
+
+ /* Get container stats if container path provided */
+ if (cfg.container_path)
+ get_container_stats();
+
+ /* Get task delays */
+ get_task_delays();
+
+ /* Sort tasks */
+ sort_tasks();
+
+ /* Display results to stdout or log file */
+ display_results();
+
+ /* Check for iterations */
+ if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
+ break;
+
+ /* Exit if output_one_time is set */
+ if (cfg.output_one_time)
+ break;
+
+ /* Check for 'q' key to quit */
+ if (use_q_quit) {
+ struct timeval tv = {cfg.delay, 0};
+ fd_set readfds;
+
+ FD_ZERO(&readfds);
+ FD_SET(STDIN_FILENO, &readfds);
+ int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv);
+
+ if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
+ char ch = 0;
+
+ read(STDIN_FILENO, &ch, 1);
+ if (ch == 'q' || ch == 'Q') {
+ running = 0;
+ break;
+ }
+ }
+ } else {
+ sleep(cfg.delay);
+ }
+ }
+
+ /* Restore terminal mode */
+ if (use_q_quit)
+ disable_raw_mode();
+
+ /* Cleanup */
+ close(nl_sd);
+ if (cfg.container_path)
+ free(cfg.container_path);
+
+ return 0;
+}
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 3feac0482fe9..21cb3c3d1331 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -194,75 +194,108 @@ static int get_family_id(int sd)
#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1))
#define delay_ms(t) (t / 1000000ULL)
+/*
+ * Version compatibility note:
+ * Field availability depends on taskstats version (t->version),
+ * corresponding to TASKSTATS_VERSION in kernel headers
+ * see include/uapi/linux/taskstats.h
+ *
+ * Version feature mapping:
+ * version >= 11 - supports COMPACT statistics
+ * version >= 13 - supports WPCOPY statistics
+ * version >= 14 - supports IRQ statistics
+ * version >= 16 - supports *_max and *_min delay statistics
+ *
+ * Always verify version before accessing version-dependent fields
+ * to maintain backward compatibility.
+ */
+#define PRINT_CPU_DELAY(version, t) \
+ do { \
+ if (version >= 16) { \
+ printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \
+ "CPU", "count", "real total", "virtual total", \
+ "delay total", "delay average", "delay max", "delay min"); \
+ printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+ (unsigned long long)(t)->cpu_count, \
+ (unsigned long long)(t)->cpu_run_real_total, \
+ (unsigned long long)(t)->cpu_run_virtual_total, \
+ (unsigned long long)(t)->cpu_delay_total, \
+ average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \
+ delay_ms((double)(t)->cpu_delay_max), \
+ delay_ms((double)(t)->cpu_delay_min)); \
+ } else { \
+ printf("%-10s%15s%15s%15s%15s%15s\n", \
+ "CPU", "count", "real total", "virtual total", \
+ "delay total", "delay average"); \
+ printf(" %15llu%15llu%15llu%15llu%15.3fms\n", \
+ (unsigned long long)(t)->cpu_count, \
+ (unsigned long long)(t)->cpu_run_real_total, \
+ (unsigned long long)(t)->cpu_run_virtual_total, \
+ (unsigned long long)(t)->cpu_delay_total, \
+ average_ms((double)(t)->cpu_delay_total, (t)->cpu_count)); \
+ } \
+ } while (0)
+#define PRINT_FILED_DELAY(name, version, t, count, total, max, min) \
+ do { \
+ if (version >= 16) { \
+ printf("%-10s%15s%15s%15s%15s%15s\n", \
+ name, "count", "delay total", "delay average", \
+ "delay max", "delay min"); \
+ printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+ (unsigned long long)(t)->count, \
+ (unsigned long long)(t)->total, \
+ average_ms((double)(t)->total, (t)->count), \
+ delay_ms((double)(t)->max), \
+ delay_ms((double)(t)->min)); \
+ } else { \
+ printf("%-10s%15s%15s%15s\n", \
+ name, "count", "delay total", "delay average"); \
+ printf(" %15llu%15llu%15.3fms\n", \
+ (unsigned long long)(t)->count, \
+ (unsigned long long)(t)->total, \
+ average_ms((double)(t)->total, (t)->count)); \
+ } \
+ } while (0)
+
static void print_delayacct(struct taskstats *t)
{
- printf("\n\nCPU %15s%15s%15s%15s%15s%15s%15s\n"
- " %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "IO %15s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "SWAP %15s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "RECLAIM %12s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "THRASHING%12s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "COMPACT %12s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "WPCOPY %12s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n"
- "IRQ %15s%15s%15s%15s%15s\n"
- " %15llu%15llu%15.3fms%13.6fms%13.6fms\n",
- "count", "real total", "virtual total",
- "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->cpu_count,
- (unsigned long long)t->cpu_run_real_total,
- (unsigned long long)t->cpu_run_virtual_total,
- (unsigned long long)t->cpu_delay_total,
- average_ms((double)t->cpu_delay_total, t->cpu_count),
- delay_ms((double)t->cpu_delay_max),
- delay_ms((double)t->cpu_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->blkio_count,
- (unsigned long long)t->blkio_delay_total,
- average_ms((double)t->blkio_delay_total, t->blkio_count),
- delay_ms((double)t->blkio_delay_max),
- delay_ms((double)t->blkio_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->swapin_count,
- (unsigned long long)t->swapin_delay_total,
- average_ms((double)t->swapin_delay_total, t->swapin_count),
- delay_ms((double)t->swapin_delay_max),
- delay_ms((double)t->swapin_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->freepages_count,
- (unsigned long long)t->freepages_delay_total,
- average_ms((double)t->freepages_delay_total, t->freepages_count),
- delay_ms((double)t->freepages_delay_max),
- delay_ms((double)t->freepages_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->thrashing_count,
- (unsigned long long)t->thrashing_delay_total,
- average_ms((double)t->thrashing_delay_total, t->thrashing_count),
- delay_ms((double)t->thrashing_delay_max),
- delay_ms((double)t->thrashing_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->compact_count,
- (unsigned long long)t->compact_delay_total,
- average_ms((double)t->compact_delay_total, t->compact_count),
- delay_ms((double)t->compact_delay_max),
- delay_ms((double)t->compact_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->wpcopy_count,
- (unsigned long long)t->wpcopy_delay_total,
- average_ms((double)t->wpcopy_delay_total, t->wpcopy_count),
- delay_ms((double)t->wpcopy_delay_max),
- delay_ms((double)t->wpcopy_delay_min),
- "count", "delay total", "delay average", "delay max", "delay min",
- (unsigned long long)t->irq_count,
- (unsigned long long)t->irq_delay_total,
- average_ms((double)t->irq_delay_total, t->irq_count),
- delay_ms((double)t->irq_delay_max),
- delay_ms((double)t->irq_delay_min));
+ printf("\n\n");
+
+ PRINT_CPU_DELAY(t->version, t);
+
+ PRINT_FILED_DELAY("IO", t->version, t,
+ blkio_count, blkio_delay_total,
+ blkio_delay_max, blkio_delay_min);
+
+ PRINT_FILED_DELAY("SWAP", t->version, t,
+ swapin_count, swapin_delay_total,
+ swapin_delay_max, swapin_delay_min);
+
+ PRINT_FILED_DELAY("RECLAIM", t->version, t,
+ freepages_count, freepages_delay_total,
+ freepages_delay_max, freepages_delay_min);
+
+ PRINT_FILED_DELAY("THRASHING", t->version, t,
+ thrashing_count, thrashing_delay_total,
+ thrashing_delay_max, thrashing_delay_min);
+
+ if (t->version >= 11) {
+ PRINT_FILED_DELAY("COMPACT", t->version, t,
+ compact_count, compact_delay_total,
+ compact_delay_max, compact_delay_min);
+ }
+
+ if (t->version >= 13) {
+ PRINT_FILED_DELAY("WPCOPY", t->version, t,
+ wpcopy_count, wpcopy_delay_total,
+ wpcopy_delay_max, wpcopy_delay_min);
+ }
+
+ if (t->version >= 14) {
+ PRINT_FILED_DELAY("IRQ", t->version, t,
+ irq_count, irq_delay_total,
+ irq_delay_max, irq_delay_min);
+ }
}
static void task_context_switch_counts(struct taskstats *t)
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index fb11108aaf42..3340def58d01 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -47,10 +47,11 @@ name as necessary to disambiguate it from others is necessary. Note that option
MSRs are read as 64-bits, u32 truncates the displayed value to 32-bits.
default: u64
- format: {\fBraw\fP | \fBdelta\fP | \fBpercent\fP}
+ format: {\fBraw\fP | \fBdelta\fP | \fBpercent\fP | \fBaverage\fP}
'raw' shows the MSR contents in hex.
'delta' shows the difference in values during the measurement interval.
'percent' shows the delta as a percentage of the cycles elapsed.
+ 'average' similar to raw, but also averaged for node/package summaries (or when using -S).
default: delta
name: "name_string"
@@ -186,6 +187,14 @@ The system configuration dump (if --quiet is not used) is followed by statistics
.PP
\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used.
.PP
+\fBTotl%C0\fP Weighted percentage of time that CPUs are busy. If N CPUs are busy during an interval, the percentage is N * 100%.
+.PP
+\fBAny%C0\fP Percentage of time that at least one CPU is busy.
+.PP
+\fBGFX%C0\fP Percentage of time that at least one GFX compute engine is busy.
+.PP
+\fBCPUGFX%\fP Percentage of time that at least one CPU is busy at the same time as at least one Graphics compute enginer is busy.
+.PP
\fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters.
.PP
\fBPkgWatt\fP Watts consumed by the whole package.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 5230e072e414..72a280e7a9d5 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -67,6 +67,7 @@
#include <stdbool.h>
#include <assert.h>
#include <linux/kernel.h>
+#include <limits.h>
#define UNUSED(x) (void)(x)
@@ -194,6 +195,7 @@ struct msr_counter bic[] = {
{ 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "Die", NULL, 0, 0, 0, NULL, 0 },
+ { 0x0, "L3", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 },
@@ -209,91 +211,238 @@ struct msr_counter bic[] = {
{ 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 },
};
-#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
-#define BIC_USEC (1ULL << 0)
-#define BIC_TOD (1ULL << 1)
-#define BIC_Package (1ULL << 2)
-#define BIC_Node (1ULL << 3)
-#define BIC_Avg_MHz (1ULL << 4)
-#define BIC_Busy (1ULL << 5)
-#define BIC_Bzy_MHz (1ULL << 6)
-#define BIC_TSC_MHz (1ULL << 7)
-#define BIC_IRQ (1ULL << 8)
-#define BIC_SMI (1ULL << 9)
-#define BIC_cpuidle (1ULL << 10)
-#define BIC_CPU_c1 (1ULL << 11)
-#define BIC_CPU_c3 (1ULL << 12)
-#define BIC_CPU_c6 (1ULL << 13)
-#define BIC_CPU_c7 (1ULL << 14)
-#define BIC_ThreadC (1ULL << 15)
-#define BIC_CoreTmp (1ULL << 16)
-#define BIC_CoreCnt (1ULL << 17)
-#define BIC_PkgTmp (1ULL << 18)
-#define BIC_GFX_rc6 (1ULL << 19)
-#define BIC_GFXMHz (1ULL << 20)
-#define BIC_Pkgpc2 (1ULL << 21)
-#define BIC_Pkgpc3 (1ULL << 22)
-#define BIC_Pkgpc6 (1ULL << 23)
-#define BIC_Pkgpc7 (1ULL << 24)
-#define BIC_Pkgpc8 (1ULL << 25)
-#define BIC_Pkgpc9 (1ULL << 26)
-#define BIC_Pkgpc10 (1ULL << 27)
-#define BIC_CPU_LPI (1ULL << 28)
-#define BIC_SYS_LPI (1ULL << 29)
-#define BIC_PkgWatt (1ULL << 30)
-#define BIC_CorWatt (1ULL << 31)
-#define BIC_GFXWatt (1ULL << 32)
-#define BIC_PkgCnt (1ULL << 33)
-#define BIC_RAMWatt (1ULL << 34)
-#define BIC_PKG__ (1ULL << 35)
-#define BIC_RAM__ (1ULL << 36)
-#define BIC_Pkg_J (1ULL << 37)
-#define BIC_Cor_J (1ULL << 38)
-#define BIC_GFX_J (1ULL << 39)
-#define BIC_RAM_J (1ULL << 40)
-#define BIC_Mod_c6 (1ULL << 41)
-#define BIC_Totl_c0 (1ULL << 42)
-#define BIC_Any_c0 (1ULL << 43)
-#define BIC_GFX_c0 (1ULL << 44)
-#define BIC_CPUGFX (1ULL << 45)
-#define BIC_Core (1ULL << 46)
-#define BIC_CPU (1ULL << 47)
-#define BIC_APIC (1ULL << 48)
-#define BIC_X2APIC (1ULL << 49)
-#define BIC_Die (1ULL << 50)
-#define BIC_GFXACTMHz (1ULL << 51)
-#define BIC_IPC (1ULL << 52)
-#define BIC_CORE_THROT_CNT (1ULL << 53)
-#define BIC_UNCORE_MHZ (1ULL << 54)
-#define BIC_SAM_mc6 (1ULL << 55)
-#define BIC_SAMMHz (1ULL << 56)
-#define BIC_SAMACTMHz (1ULL << 57)
-#define BIC_Diec6 (1ULL << 58)
-#define BIC_SysWatt (1ULL << 59)
-#define BIC_Sys_J (1ULL << 60)
-#define BIC_NMI (1ULL << 61)
-#define BIC_CPU_c1e (1ULL << 62)
-#define BIC_pct_idle (1ULL << 63)
-
-#define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die)
-#define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt)
-#define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
-#define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6)
-#define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle )
-#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle)
-#define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
-
-#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_cpuidle)
-
-unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT);
-unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_cpuidle | BIC_pct_idle | BIC_APIC | BIC_X2APIC;
-
-#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME)
-#define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME)
-#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME)
-#define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT)
-#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT)
-#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT)
+/* n.b. bic_names must match the order in bic[], above */
+enum bic_names {
+ BIC_USEC,
+ BIC_TOD,
+ BIC_Package,
+ BIC_Node,
+ BIC_Avg_MHz,
+ BIC_Busy,
+ BIC_Bzy_MHz,
+ BIC_TSC_MHz,
+ BIC_IRQ,
+ BIC_SMI,
+ BIC_cpuidle,
+ BIC_CPU_c1,
+ BIC_CPU_c3,
+ BIC_CPU_c6,
+ BIC_CPU_c7,
+ BIC_ThreadC,
+ BIC_CoreTmp,
+ BIC_CoreCnt,
+ BIC_PkgTmp,
+ BIC_GFX_rc6,
+ BIC_GFXMHz,
+ BIC_Pkgpc2,
+ BIC_Pkgpc3,
+ BIC_Pkgpc6,
+ BIC_Pkgpc7,
+ BIC_Pkgpc8,
+ BIC_Pkgpc9,
+ BIC_Pkgpc10,
+ BIC_CPU_LPI,
+ BIC_SYS_LPI,
+ BIC_PkgWatt,
+ BIC_CorWatt,
+ BIC_GFXWatt,
+ BIC_PkgCnt,
+ BIC_RAMWatt,
+ BIC_PKG__,
+ BIC_RAM__,
+ BIC_Pkg_J,
+ BIC_Cor_J,
+ BIC_GFX_J,
+ BIC_RAM_J,
+ BIC_Mod_c6,
+ BIC_Totl_c0,
+ BIC_Any_c0,
+ BIC_GFX_c0,
+ BIC_CPUGFX,
+ BIC_Core,
+ BIC_CPU,
+ BIC_APIC,
+ BIC_X2APIC,
+ BIC_Die,
+ BIC_L3,
+ BIC_GFXACTMHz,
+ BIC_IPC,
+ BIC_CORE_THROT_CNT,
+ BIC_UNCORE_MHZ,
+ BIC_SAM_mc6,
+ BIC_SAMMHz,
+ BIC_SAMACTMHz,
+ BIC_Diec6,
+ BIC_SysWatt,
+ BIC_Sys_J,
+ BIC_NMI,
+ BIC_CPU_c1e,
+ BIC_pct_idle,
+ MAX_BIC
+};
+
+void print_bic_set(char *s, cpu_set_t *set)
+{
+ int i;
+
+ assert(MAX_BIC < CPU_SETSIZE);
+
+ printf("%s:", s);
+
+ for (i = 0; i <= MAX_BIC; ++i) {
+
+ if (CPU_ISSET(i, set)) {
+ assert(i < MAX_BIC);
+ printf(" %s", bic[i].name);
+ }
+ }
+ putchar('\n');
+}
+
+static cpu_set_t bic_group_topology;
+static cpu_set_t bic_group_thermal_pwr;
+static cpu_set_t bic_group_frequency;
+static cpu_set_t bic_group_hw_idle;
+static cpu_set_t bic_group_sw_idle;
+static cpu_set_t bic_group_idle;
+static cpu_set_t bic_group_other;
+static cpu_set_t bic_group_disabled_by_default;
+static cpu_set_t bic_enabled;
+static cpu_set_t bic_present;
+
+/* modify */
+#define BIC_INIT(set) CPU_ZERO(set)
+
+#define SET_BIC(COUNTER_NUMBER, set) CPU_SET(COUNTER_NUMBER, set)
+#define CLR_BIC(COUNTER_NUMBER, set) CPU_CLR(COUNTER_NUMBER, set)
+
+#define BIC_PRESENT(COUNTER_NUMBER) SET_BIC(COUNTER_NUMBER, &bic_present)
+#define BIC_NOT_PRESENT(COUNTER_NUMBER) CPU_CLR(COUNTER_NUMBER, &bic_present)
+
+/* test */
+#define BIC_IS_ENABLED(COUNTER_NUMBER) CPU_ISSET(COUNTER_NUMBER, &bic_enabled)
+#define DO_BIC_READ(COUNTER_NUMBER) CPU_ISSET(COUNTER_NUMBER, &bic_present)
+#define DO_BIC(COUNTER_NUMBER) (CPU_ISSET(COUNTER_NUMBER, &bic_enabled) && CPU_ISSET(COUNTER_NUMBER, &bic_present))
+
+static void bic_set_all(cpu_set_t *set)
+{
+ int i;
+
+ assert(MAX_BIC < CPU_SETSIZE);
+
+ for (i = 0; i < MAX_BIC; ++i)
+ SET_BIC(i, set);
+}
+
+/*
+ * bic_clear_bits()
+ * clear all the bits from "clr" in "dst"
+ */
+static void bic_clear_bits(cpu_set_t *dst, cpu_set_t *clr)
+{
+ int i;
+
+ assert(MAX_BIC < CPU_SETSIZE);
+
+ for (i = 0; i < MAX_BIC; ++i)
+ if (CPU_ISSET(i, clr))
+ CLR_BIC(i, dst);
+}
+
+static void bic_groups_init(void)
+{
+ BIC_INIT(&bic_group_topology);
+ SET_BIC(BIC_Package, &bic_group_topology);
+ SET_BIC(BIC_Node, &bic_group_topology);
+ SET_BIC(BIC_CoreCnt, &bic_group_topology);
+ SET_BIC(BIC_PkgCnt, &bic_group_topology);
+ SET_BIC(BIC_Core, &bic_group_topology);
+ SET_BIC(BIC_CPU, &bic_group_topology);
+ SET_BIC(BIC_Die, &bic_group_topology);
+ SET_BIC(BIC_L3, &bic_group_topology);
+
+ BIC_INIT(&bic_group_thermal_pwr);
+ SET_BIC(BIC_CoreTmp, &bic_group_thermal_pwr);
+ SET_BIC(BIC_PkgTmp, &bic_group_thermal_pwr);
+ SET_BIC(BIC_PkgWatt, &bic_group_thermal_pwr);
+ SET_BIC(BIC_CorWatt, &bic_group_thermal_pwr);
+ SET_BIC(BIC_GFXWatt, &bic_group_thermal_pwr);
+ SET_BIC(BIC_RAMWatt, &bic_group_thermal_pwr);
+ SET_BIC(BIC_PKG__, &bic_group_thermal_pwr);
+ SET_BIC(BIC_RAM__, &bic_group_thermal_pwr);
+ SET_BIC(BIC_SysWatt, &bic_group_thermal_pwr);
+
+ BIC_INIT(&bic_group_frequency);
+ SET_BIC(BIC_Avg_MHz, &bic_group_frequency);
+ SET_BIC(BIC_Busy, &bic_group_frequency);
+ SET_BIC(BIC_Bzy_MHz, &bic_group_frequency);
+ SET_BIC(BIC_TSC_MHz, &bic_group_frequency);
+ SET_BIC(BIC_GFXMHz, &bic_group_frequency);
+ SET_BIC(BIC_GFXACTMHz, &bic_group_frequency);
+ SET_BIC(BIC_SAMMHz, &bic_group_frequency);
+ SET_BIC(BIC_SAMACTMHz, &bic_group_frequency);
+ SET_BIC(BIC_UNCORE_MHZ, &bic_group_frequency);
+
+ BIC_INIT(&bic_group_hw_idle);
+ SET_BIC(BIC_Busy, &bic_group_hw_idle);
+ SET_BIC(BIC_CPU_c1, &bic_group_hw_idle);
+ SET_BIC(BIC_CPU_c3, &bic_group_hw_idle);
+ SET_BIC(BIC_CPU_c6, &bic_group_hw_idle);
+ SET_BIC(BIC_CPU_c7, &bic_group_hw_idle);
+ SET_BIC(BIC_GFX_rc6, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc2, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc3, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc6, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc7, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc8, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc9, &bic_group_hw_idle);
+ SET_BIC(BIC_Pkgpc10, &bic_group_hw_idle);
+ SET_BIC(BIC_CPU_LPI, &bic_group_hw_idle);
+ SET_BIC(BIC_SYS_LPI, &bic_group_hw_idle);
+ SET_BIC(BIC_Mod_c6, &bic_group_hw_idle);
+ SET_BIC(BIC_Totl_c0, &bic_group_hw_idle);
+ SET_BIC(BIC_Any_c0, &bic_group_hw_idle);
+ SET_BIC(BIC_GFX_c0, &bic_group_hw_idle);
+ SET_BIC(BIC_CPUGFX, &bic_group_hw_idle);
+ SET_BIC(BIC_SAM_mc6, &bic_group_hw_idle);
+ SET_BIC(BIC_Diec6, &bic_group_hw_idle);
+
+ BIC_INIT(&bic_group_sw_idle);
+ SET_BIC(BIC_Busy, &bic_group_sw_idle);
+ SET_BIC(BIC_cpuidle, &bic_group_sw_idle);
+ SET_BIC(BIC_pct_idle, &bic_group_sw_idle);
+
+ BIC_INIT(&bic_group_idle);
+ CPU_OR(&bic_group_idle, &bic_group_idle, &bic_group_hw_idle);
+ SET_BIC(BIC_pct_idle, &bic_group_idle);
+
+ BIC_INIT(&bic_group_other);
+ SET_BIC(BIC_IRQ, &bic_group_other);
+ SET_BIC(BIC_NMI, &bic_group_other);
+ SET_BIC(BIC_SMI, &bic_group_other);
+ SET_BIC(BIC_ThreadC, &bic_group_other);
+ SET_BIC(BIC_CoreTmp, &bic_group_other);
+ SET_BIC(BIC_IPC, &bic_group_other);
+
+ BIC_INIT(&bic_group_disabled_by_default);
+ SET_BIC(BIC_USEC, &bic_group_disabled_by_default);
+ SET_BIC(BIC_TOD, &bic_group_disabled_by_default);
+ SET_BIC(BIC_cpuidle, &bic_group_disabled_by_default);
+ SET_BIC(BIC_APIC, &bic_group_disabled_by_default);
+ SET_BIC(BIC_X2APIC, &bic_group_disabled_by_default);
+
+ BIC_INIT(&bic_enabled);
+ bic_set_all(&bic_enabled);
+ bic_clear_bits(&bic_enabled, &bic_group_disabled_by_default);
+
+ BIC_INIT(&bic_present);
+ SET_BIC(BIC_USEC, &bic_present);
+ SET_BIC(BIC_TOD, &bic_present);
+ SET_BIC(BIC_cpuidle, &bic_present);
+ SET_BIC(BIC_APIC, &bic_present);
+ SET_BIC(BIC_X2APIC, &bic_present);
+ SET_BIC(BIC_pct_idle, &bic_present);
+}
/*
* MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
@@ -840,20 +989,21 @@ static const struct platform_features spr_features = {
};
static const struct platform_features dmr_features = {
- .has_msr_misc_feature_control = spr_features.has_msr_misc_feature_control,
- .has_msr_misc_pwr_mgmt = spr_features.has_msr_misc_pwr_mgmt,
- .has_nhm_msrs = spr_features.has_nhm_msrs,
- .has_config_tdp = spr_features.has_config_tdp,
- .bclk_freq = spr_features.bclk_freq,
- .supported_cstates = spr_features.supported_cstates,
- .cst_limit = spr_features.cst_limit,
- .has_msr_core_c1_res = spr_features.has_msr_core_c1_res,
- .has_msr_module_c6_res_ms = 1, /* DMR has Dual Core Module and MC6 MSR */
- .has_irtl_msrs = spr_features.has_irtl_msrs,
- .has_cst_prewake_bit = spr_features.has_cst_prewake_bit,
- .has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit,
- .trl_msrs = spr_features.trl_msrs,
- .rapl_msrs = 0, /* DMR does not have RAPL MSRs */
+ .has_msr_misc_feature_control = spr_features.has_msr_misc_feature_control,
+ .has_msr_misc_pwr_mgmt = spr_features.has_msr_misc_pwr_mgmt,
+ .has_nhm_msrs = spr_features.has_nhm_msrs,
+ .bclk_freq = spr_features.bclk_freq,
+ .supported_cstates = spr_features.supported_cstates,
+ .cst_limit = spr_features.cst_limit,
+ .has_msr_core_c1_res = spr_features.has_msr_core_c1_res,
+ .has_cst_prewake_bit = spr_features.has_cst_prewake_bit,
+ .has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit,
+ .trl_msrs = spr_features.trl_msrs,
+ .has_msr_module_c6_res_ms = 1, /* DMR has Dual-Core-Module and MC6 MSR */
+ .rapl_msrs = 0, /* DMR does not have RAPL MSRs */
+ .plr_msrs = 0, /* DMR does not have PLR MSRs */
+ .has_irtl_msrs = 0, /* DMR does not have IRTL MSRs */
+ .has_config_tdp = 0, /* DMR does not have CTDP MSRs */
};
static const struct platform_features srf_features = {
@@ -1204,7 +1354,7 @@ struct rapl_counter_arch_info {
int msr_shift; /* Positive mean shift right, negative mean shift left */
double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */
unsigned int rci_index; /* Maps data from perf counters to global variables */
- unsigned long long bic;
+ unsigned int bic_number;
double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */
unsigned long long flags;
};
@@ -1219,7 +1369,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_energy_units,
.rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
- .bic = BIC_PkgWatt | BIC_Pkg_J,
+ .bic_number = BIC_PkgWatt,
+ .compat_scale = 1.0,
+ .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+ },
+ {
+ .feature_mask = RAPL_PKG,
+ .perf_subsys = "power",
+ .perf_name = "energy-pkg",
+ .msr = MSR_PKG_ENERGY_STATUS,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_energy_units,
+ .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
+ .bic_number = BIC_Pkg_J,
.compat_scale = 1.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1232,7 +1395,33 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_energy_units,
.rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
- .bic = BIC_PkgWatt | BIC_Pkg_J,
+ .bic_number = BIC_PkgWatt,
+ .compat_scale = 1.0,
+ .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+ },
+ {
+ .feature_mask = RAPL_AMD_F17H,
+ .perf_subsys = "power",
+ .perf_name = "energy-pkg",
+ .msr = MSR_PKG_ENERGY_STAT,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_energy_units,
+ .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
+ .bic_number = BIC_Pkg_J,
+ .compat_scale = 1.0,
+ .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+ },
+ {
+ .feature_mask = RAPL_CORE_ENERGY_STATUS,
+ .perf_subsys = "power",
+ .perf_name = "energy-cores",
+ .msr = MSR_PP0_ENERGY_STATUS,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_energy_units,
+ .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
+ .bic_number = BIC_CorWatt,
.compat_scale = 1.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1245,7 +1434,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_energy_units,
.rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
- .bic = BIC_CorWatt | BIC_Cor_J,
+ .bic_number = BIC_Cor_J,
.compat_scale = 1.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1258,7 +1447,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_dram_energy_units,
.rci_index = RAPL_RCI_INDEX_DRAM,
- .bic = BIC_RAMWatt | BIC_RAM_J,
+ .bic_number = BIC_RAMWatt,
+ .compat_scale = 1.0,
+ .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+ },
+ {
+ .feature_mask = RAPL_DRAM,
+ .perf_subsys = "power",
+ .perf_name = "energy-ram",
+ .msr = MSR_DRAM_ENERGY_STATUS,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_dram_energy_units,
+ .rci_index = RAPL_RCI_INDEX_DRAM,
+ .bic_number = BIC_RAM_J,
.compat_scale = 1.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1271,7 +1473,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_energy_units,
.rci_index = RAPL_RCI_INDEX_GFX,
- .bic = BIC_GFXWatt | BIC_GFX_J,
+ .bic_number = BIC_GFXWatt,
+ .compat_scale = 1.0,
+ .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+ },
+ {
+ .feature_mask = RAPL_GFX,
+ .perf_subsys = "power",
+ .perf_name = "energy-gpu",
+ .msr = MSR_PP1_ENERGY_STATUS,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_energy_units,
+ .rci_index = RAPL_RCI_INDEX_GFX,
+ .bic_number = BIC_GFX_J,
.compat_scale = 1.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1284,7 +1499,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_time_units,
.rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS,
- .bic = BIC_PKG__,
+ .bic_number = BIC_PKG__,
.compat_scale = 100.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1297,7 +1512,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_time_units,
.rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS,
- .bic = BIC_RAM__,
+ .bic_number = BIC_RAM__,
.compat_scale = 100.0,
.flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1310,7 +1525,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_energy_units,
.rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
- .bic = BIC_CorWatt | BIC_Cor_J,
+ .bic_number = BIC_CorWatt,
+ .compat_scale = 1.0,
+ .flags = 0,
+ },
+ {
+ .feature_mask = RAPL_AMD_F17H,
+ .perf_subsys = NULL,
+ .perf_name = NULL,
+ .msr = MSR_CORE_ENERGY_STAT,
+ .msr_mask = 0xFFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_energy_units,
+ .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
+ .bic_number = BIC_Cor_J,
.compat_scale = 1.0,
.flags = 0,
},
@@ -1323,7 +1551,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
.msr_shift = 0,
.platform_rapl_msr_scale = &rapl_psys_energy_units,
.rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM,
- .bic = BIC_SysWatt | BIC_Sys_J,
+ .bic_number = BIC_SysWatt,
+ .compat_scale = 1.0,
+ .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM,
+ },
+ {
+ .feature_mask = RAPL_PSYS,
+ .perf_subsys = "power",
+ .perf_name = "energy-psys",
+ .msr = MSR_PLATFORM_ENERGY_STATUS,
+ .msr_mask = 0x00000000FFFFFFFF,
+ .msr_shift = 0,
+ .platform_rapl_msr_scale = &rapl_psys_energy_units,
+ .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM,
+ .bic_number = BIC_Sys_J,
.compat_scale = 1.0,
.flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM,
},
@@ -1372,7 +1613,7 @@ struct cstate_counter_arch_info {
const char *perf_name;
unsigned long long msr;
unsigned int rci_index; /* Maps data from perf counters to global variables */
- unsigned long long bic;
+ unsigned int bic_number;
unsigned long long flags;
int pkg_cstate_limit;
};
@@ -1384,7 +1625,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c1-residency",
.msr = MSR_CORE_C1_RES,
.rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY,
- .bic = BIC_CPU_c1,
+ .bic_number = BIC_CPU_c1,
.flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD,
.pkg_cstate_limit = 0,
},
@@ -1394,7 +1635,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c3-residency",
.msr = MSR_CORE_C3_RESIDENCY,
.rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY,
- .bic = BIC_CPU_c3,
+ .bic_number = BIC_CPU_c3,
.flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
.pkg_cstate_limit = 0,
},
@@ -1404,7 +1645,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c6-residency",
.msr = MSR_CORE_C6_RESIDENCY,
.rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY,
- .bic = BIC_CPU_c6,
+ .bic_number = BIC_CPU_c6,
.flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
.pkg_cstate_limit = 0,
},
@@ -1414,7 +1655,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c7-residency",
.msr = MSR_CORE_C7_RESIDENCY,
.rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY,
- .bic = BIC_CPU_c7,
+ .bic_number = BIC_CPU_c7,
.flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
.pkg_cstate_limit = 0,
},
@@ -1424,7 +1665,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c2-residency",
.msr = MSR_PKG_C2_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY,
- .bic = BIC_Pkgpc2,
+ .bic_number = BIC_Pkgpc2,
.flags = 0,
.pkg_cstate_limit = PCL__2,
},
@@ -1434,7 +1675,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c3-residency",
.msr = MSR_PKG_C3_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY,
- .bic = BIC_Pkgpc3,
+ .bic_number = BIC_Pkgpc3,
.flags = 0,
.pkg_cstate_limit = PCL__3,
},
@@ -1444,7 +1685,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c6-residency",
.msr = MSR_PKG_C6_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY,
- .bic = BIC_Pkgpc6,
+ .bic_number = BIC_Pkgpc6,
.flags = 0,
.pkg_cstate_limit = PCL__6,
},
@@ -1454,7 +1695,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c7-residency",
.msr = MSR_PKG_C7_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY,
- .bic = BIC_Pkgpc7,
+ .bic_number = BIC_Pkgpc7,
.flags = 0,
.pkg_cstate_limit = PCL__7,
},
@@ -1464,7 +1705,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c8-residency",
.msr = MSR_PKG_C8_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY,
- .bic = BIC_Pkgpc8,
+ .bic_number = BIC_Pkgpc8,
.flags = 0,
.pkg_cstate_limit = PCL__8,
},
@@ -1474,7 +1715,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c9-residency",
.msr = MSR_PKG_C9_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY,
- .bic = BIC_Pkgpc9,
+ .bic_number = BIC_Pkgpc9,
.flags = 0,
.pkg_cstate_limit = PCL__9,
},
@@ -1484,7 +1725,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
.perf_name = "c10-residency",
.msr = MSR_PKG_C10_RESIDENCY,
.rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY,
- .bic = BIC_Pkgpc10,
+ .bic_number = BIC_Pkgpc10,
.flags = 0,
.pkg_cstate_limit = PCL_10,
},
@@ -1840,8 +2081,6 @@ struct pkg_data {
((node_no) * topo.cores_per_node) + \
(core_no))
-#define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no)
-
/*
* The accumulated sum of MSR is defined as a monotonic
* increasing MSR, it will be accumulated periodically,
@@ -2036,6 +2275,7 @@ struct platform_counters {
struct cpu_topology {
int physical_package_id;
int die_id;
+ int l3_id;
int logical_cpu_id;
int physical_node_id;
int logical_node_id; /* 0-based count within the package */
@@ -2057,6 +2297,7 @@ struct topo_params {
int max_core_id;
int max_package_id;
int max_die_id;
+ int max_l3_id;
int max_node_num;
int nodes_per_pkg;
int cores_per_node;
@@ -2090,6 +2331,8 @@ int cpu_is_not_allowed(int cpu)
* skip non-present cpus
*/
+#define PER_THREAD_PARAMS struct thread_data *t, struct core_data *c, struct pkg_data *p
+
int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
{
@@ -2103,16 +2346,15 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk
for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
struct thread_data *t;
struct core_data *c;
- struct pkg_data *p;
+
t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
if (cpu_is_not_allowed(t->cpu_id))
continue;
c = GET_CORE(core_base, core_no, node_no, pkg_no);
- p = GET_PKG(pkg_base, pkg_no);
- retval |= func(t, c, p);
+ retval |= func(t, c, &pkg_base[pkg_no]);
}
}
}
@@ -2120,21 +2362,21 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk
return retval;
}
-int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int is_cpu_first_thread_in_core(PER_THREAD_PARAMS)
{
UNUSED(p);
return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
}
-int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int is_cpu_first_core_in_package(PER_THREAD_PARAMS)
{
UNUSED(c);
return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
}
-int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int is_cpu_first_thread_in_package(PER_THREAD_PARAMS)
{
return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p);
}
@@ -2179,10 +2421,13 @@ int get_msr_fd(int cpu)
static void bic_disable_msr_access(void)
{
- const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp |
- BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp;
-
- bic_enabled &= ~bic_msrs;
+ CLR_BIC(BIC_Mod_c6, &bic_enabled);
+ CLR_BIC(BIC_CoreTmp, &bic_enabled);
+ CLR_BIC(BIC_Totl_c0, &bic_enabled);
+ CLR_BIC(BIC_Any_c0, &bic_enabled);
+ CLR_BIC(BIC_GFX_c0, &bic_enabled);
+ CLR_BIC(BIC_CPUGFX, &bic_enabled);
+ CLR_BIC(BIC_PkgTmp, &bic_enabled);
free_sys_msr_counters();
}
@@ -2310,6 +2555,8 @@ char *deferred_add_names[MAX_DEFERRED];
char *deferred_skip_names[MAX_DEFERRED];
int deferred_add_index;
int deferred_skip_index;
+unsigned int deferred_add_consumed;
+unsigned int deferred_skip_consumed;
/*
* HIDE_LIST - hide this list of counters, show the rest [default]
@@ -2380,10 +2627,9 @@ void help(void)
* for all the strings in comma separate name_list,
* set the approprate bit in return value.
*/
-unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
+void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode)
{
unsigned int i;
- unsigned long long retval = 0;
while (name_list) {
char *comma;
@@ -2395,41 +2641,39 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
for (i = 0; i < MAX_BIC; ++i) {
if (!strcmp(name_list, bic[i].name)) {
- retval |= (1ULL << i);
+ SET_BIC(i, ret_set);
break;
}
if (!strcmp(name_list, "all")) {
- retval |= ~0;
+ bic_set_all(ret_set);
break;
} else if (!strcmp(name_list, "topology")) {
- retval |= BIC_GROUP_TOPOLOGY;
+ CPU_OR(ret_set, ret_set, &bic_group_topology);
break;
} else if (!strcmp(name_list, "power")) {
- retval |= BIC_GROUP_THERMAL_PWR;
+ CPU_OR(ret_set, ret_set, &bic_group_thermal_pwr);
break;
} else if (!strcmp(name_list, "idle")) {
- retval |= BIC_GROUP_IDLE;
+ CPU_OR(ret_set, ret_set, &bic_group_idle);
break;
} else if (!strcmp(name_list, "swidle")) {
- retval |= BIC_GROUP_SW_IDLE;
+ CPU_OR(ret_set, ret_set, &bic_group_sw_idle);
break;
} else if (!strcmp(name_list, "sysfs")) { /* legacy compatibility */
- retval |= BIC_GROUP_SW_IDLE;
+ CPU_OR(ret_set, ret_set, &bic_group_sw_idle);
break;
} else if (!strcmp(name_list, "hwidle")) {
- retval |= BIC_GROUP_HW_IDLE;
+ CPU_OR(ret_set, ret_set, &bic_group_hw_idle);
break;
} else if (!strcmp(name_list, "frequency")) {
- retval |= BIC_GROUP_FREQUENCY;
+ CPU_OR(ret_set, ret_set, &bic_group_frequency);
break;
} else if (!strcmp(name_list, "other")) {
- retval |= BIC_OTHER;
+ CPU_OR(ret_set, ret_set, &bic_group_other);
break;
}
-
}
if (i == MAX_BIC) {
- fprintf(stderr, "deferred %s\n", name_list);
if (mode == SHOW_LIST) {
deferred_add_names[deferred_add_index++] = name_list;
if (deferred_add_index >= MAX_DEFERRED) {
@@ -2456,7 +2700,6 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
name_list++;
}
- return retval;
}
void print_header(char *delim)
@@ -2474,6 +2717,8 @@ void print_header(char *delim)
outp += sprintf(outp, "%sPackage", (printed++ ? delim : ""));
if (DO_BIC(BIC_Die))
outp += sprintf(outp, "%sDie", (printed++ ? delim : ""));
+ if (DO_BIC(BIC_L3))
+ outp += sprintf(outp, "%sL3", (printed++ ? delim : ""));
if (DO_BIC(BIC_Node))
outp += sprintf(outp, "%sNode", (printed++ ? delim : ""));
if (DO_BIC(BIC_Core))
@@ -2514,7 +2759,7 @@ void print_header(char *delim)
for (mp = sys.tp; mp; mp = mp->next) {
- if (mp->format == FORMAT_RAW) {
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
if (mp->width == 64)
outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name);
else
@@ -2589,7 +2834,7 @@ void print_header(char *delim)
}
for (mp = sys.cp; mp; mp = mp->next) {
- if (mp->format == FORMAT_RAW) {
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
if (mp->width == 64)
outp += sprintf(outp, "%s%18.18s", delim, mp->name);
else
@@ -2719,7 +2964,7 @@ void print_header(char *delim)
outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : ""));
for (mp = sys.pp; mp; mp = mp->next) {
- if (mp->format == FORMAT_RAW) {
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
if (mp->width == 64)
outp += sprintf(outp, "%s%18.18s", delim, mp->name);
else if (mp->width == 32)
@@ -2777,7 +3022,7 @@ void print_header(char *delim)
outp += sprintf(outp, "\n");
}
-int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int dump_counters(PER_THREAD_PARAMS)
{
int i;
struct msr_counter *mp;
@@ -2892,7 +3137,7 @@ double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desir
/*
* column formatting convention & formats
*/
-int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int format_counters(PER_THREAD_PARAMS)
{
static int count;
@@ -2945,6 +3190,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
if (DO_BIC(BIC_Die))
outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
+ if (DO_BIC(BIC_L3))
+ outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
if (DO_BIC(BIC_Node))
outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
if (DO_BIC(BIC_Core))
@@ -2968,6 +3215,12 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
else
outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
}
+ if (DO_BIC(BIC_L3)) {
+ if (c)
+ outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].l3_id);
+ else
+ outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
+ }
if (DO_BIC(BIC_Node)) {
if (t)
outp += sprintf(outp, "%s%d",
@@ -3032,7 +3285,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
/* Added counters */
for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
- if (mp->format == FORMAT_RAW) {
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
if (mp->width == 32)
outp +=
sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
@@ -3129,7 +3382,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);
for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
- if (mp->format == FORMAT_RAW) {
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
if (mp->width == 32)
outp +=
sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
@@ -3328,7 +3581,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);
for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
- if (mp->format == FORMAT_RAW) {
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) {
if (mp->width == 32)
outp +=
sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
@@ -3426,7 +3679,7 @@ void flush_output_stderr(void)
outp = output_buffer;
}
-void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+void format_all_counters(PER_THREAD_PARAMS)
{
static int count;
@@ -3505,7 +3758,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value;
for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
- if (mp->format == FORMAT_RAW)
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
old->counter[i] = new->counter[i];
else if (mp->format == FORMAT_AVERAGE)
old->counter[i] = new->counter[i];
@@ -3549,7 +3802,7 @@ void delta_core(struct core_data *new, struct core_data *old)
DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value);
for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
- if (mp->format == FORMAT_RAW)
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
old->counter[i] = new->counter[i];
else
old->counter[i] = new->counter[i] - old->counter[i];
@@ -3663,7 +3916,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
old->smi_count = new->smi_count - old->smi_count;
for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
- if (mp->format == FORMAT_RAW)
+ if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
old->counter[i] = new->counter[i];
else
old->counter[i] = new->counter[i] - old->counter[i];
@@ -3717,7 +3970,7 @@ void rapl_counter_clear(struct rapl_counter *c)
c->unit = RAPL_UNIT_INVALID;
}
-void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+void clear_counters(PER_THREAD_PARAMS)
{
int i;
struct msr_counter *mp;
@@ -3814,7 +4067,7 @@ void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter
dst->raw_value += src->raw_value;
}
-int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int sum_counters(PER_THREAD_PARAMS)
{
int i;
struct msr_counter *mp;
@@ -3962,7 +4215,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
* sum the counters for all cpus in the system
* compute the weighted average
*/
-void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+void compute_average(PER_THREAD_PARAMS)
{
int i;
struct msr_counter *mp;
@@ -4545,7 +4798,7 @@ char *find_sysfs_path_by_id(struct sysfs_path *sp, int id)
return NULL;
}
-int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int get_cstate_counters(unsigned int cpu, PER_THREAD_PARAMS)
{
/*
* Overcommit memory a little bit here,
@@ -4845,7 +5098,7 @@ static inline int get_rapl_domain_id(int cpu)
* migrate to cpu
* acquire and record local counters for that cpu
*/
-int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int get_counters(PER_THREAD_PARAMS)
{
int cpu = t->cpu_id;
unsigned long long msr;
@@ -5673,6 +5926,11 @@ int get_die_id(int cpu)
return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/die_id", cpu);
}
+int get_l3_id(int cpu)
+{
+ return parse_int_file("/sys/devices/system/cpu/cpu%d/cache/index3/id", cpu);
+}
+
int get_core_id(int cpu)
{
return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
@@ -5861,7 +6119,6 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
struct thread_data *t, *t2;
struct core_data *c, *c2;
- struct pkg_data *p, *p2;
t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
@@ -5873,10 +6130,7 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
c = GET_CORE(core_base, core_no, node_no, pkg_no);
c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
- p = GET_PKG(pkg_base, pkg_no);
- p2 = GET_PKG(pkg_base2, pkg_no);
-
- retval |= func(t, c, p, t2, c2, p2);
+ retval |= func(t, c, &pkg_base[pkg_no], t2, c2, &pkg_base2[pkg_no]);
}
}
}
@@ -6334,7 +6588,7 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
timer_t timerid;
/* Timer callback, update the sum of MSRs periodically. */
-static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+static int update_msr_sum(PER_THREAD_PARAMS)
{
int i, ret;
int cpu = t->cpu_id;
@@ -6572,8 +6826,16 @@ int check_for_cap_sys_rawio(void)
int ret = 0;
caps = cap_get_proc();
- if (caps == NULL)
+ if (caps == NULL) {
+ /*
+ * CONFIG_MULTIUSER=n kernels have no cap_get_proc()
+ * Allow them to continue and attempt to access MSRs
+ */
+ if (errno == ENOSYS)
+ return 0;
+
return 1;
+ }
if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) {
ret = 1;
@@ -6740,7 +7002,8 @@ static void probe_intel_uncore_frequency_legacy(void)
sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i,
j);
- if (access(path_base, R_OK))
+ sprintf(path, "%s/current_freq_khz", path_base);
+ if (access(path, R_OK))
continue;
BIC_PRESENT(BIC_UNCORE_MHZ);
@@ -7072,7 +7335,7 @@ static void dump_sysfs_pstate_config(void)
* print_epb()
* Decode the ENERGY_PERF_BIAS MSR
*/
-int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int print_epb(PER_THREAD_PARAMS)
{
char *epb_string;
int cpu, epb;
@@ -7121,7 +7384,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
* print_hwp()
* Decode the MSR_HWP_CAPABILITIES
*/
-int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int print_hwp(PER_THREAD_PARAMS)
{
unsigned long long msr;
int cpu;
@@ -7210,7 +7473,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
/*
* print_perf_limit()
*/
-int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int print_perf_limit(PER_THREAD_PARAMS)
{
unsigned long long msr;
int cpu;
@@ -7335,21 +7598,28 @@ void rapl_probe_intel(void)
unsigned long long msr;
unsigned int time_unit;
double tdp;
- const unsigned long long bic_watt_bits = BIC_SysWatt | BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt;
- const unsigned long long bic_joules_bits = BIC_Sys_J | BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J;
- if (rapl_joules)
- bic_enabled &= ~bic_watt_bits;
- else
- bic_enabled &= ~bic_joules_bits;
+ if (rapl_joules) {
+ CLR_BIC(BIC_SysWatt, &bic_enabled);
+ CLR_BIC(BIC_PkgWatt, &bic_enabled);
+ CLR_BIC(BIC_CorWatt, &bic_enabled);
+ CLR_BIC(BIC_RAMWatt, &bic_enabled);
+ CLR_BIC(BIC_GFXWatt, &bic_enabled);
+ } else {
+ CLR_BIC(BIC_Sys_J, &bic_enabled);
+ CLR_BIC(BIC_Pkg_J, &bic_enabled);
+ CLR_BIC(BIC_Cor_J, &bic_enabled);
+ CLR_BIC(BIC_RAM_J, &bic_enabled);
+ CLR_BIC(BIC_GFX_J, &bic_enabled);
+ }
if (!platform->rapl_msrs || no_msr)
return;
if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
- bic_enabled &= ~BIC_PKG__;
+ CLR_BIC(BIC_PKG__, &bic_enabled);
if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
- bic_enabled &= ~BIC_RAM__;
+ CLR_BIC(BIC_RAM__, &bic_enabled);
/* units on package 0, verify later other packages match */
if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
@@ -7388,13 +7658,14 @@ void rapl_probe_amd(void)
{
unsigned long long msr;
double tdp;
- const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt;
- const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J;
- if (rapl_joules)
- bic_enabled &= ~bic_watt_bits;
- else
- bic_enabled &= ~bic_joules_bits;
+ if (rapl_joules) {
+ CLR_BIC(BIC_SysWatt, &bic_enabled);
+ CLR_BIC(BIC_CorWatt, &bic_enabled);
+ } else {
+ CLR_BIC(BIC_Pkg_J, &bic_enabled);
+ CLR_BIC(BIC_Cor_J, &bic_enabled);
+ }
if (!platform->rapl_msrs || no_msr)
return;
@@ -7577,7 +7848,7 @@ static int print_rapl_sysfs(void)
return 0;
}
-int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int print_rapl(PER_THREAD_PARAMS)
{
unsigned long long msr;
const char *msr_name;
@@ -7731,7 +8002,7 @@ void probe_rapl(void)
* below this value, including the Digital Thermal Sensor (DTS),
* Package Thermal Management Sensor (PTM), and thermal event thresholds.
*/
-int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int set_temperature_target(PER_THREAD_PARAMS)
{
unsigned long long msr;
unsigned int tcc_default, tcc_offset;
@@ -7799,7 +8070,7 @@ guess:
return 0;
}
-int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int print_thermal(PER_THREAD_PARAMS)
{
unsigned long long msr;
unsigned int dts, dts2;
@@ -7879,7 +8150,7 @@ void probe_thermal(void)
for_all_cpus(print_thermal, ODD_COUNTERS);
}
-int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int get_cpu_type(PER_THREAD_PARAMS)
{
unsigned int eax, ebx, ecx, edx;
@@ -8141,7 +8412,7 @@ void rapl_perf_init(void)
enum rapl_unit unit;
unsigned int next_domain;
- if (!BIC_IS_ENABLED(cai->bic))
+ if (!BIC_IS_ENABLED(cai->bic_number))
continue;
memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
@@ -8205,7 +8476,7 @@ void rapl_perf_init(void)
/* If any CPU has access to the counter, make it present */
if (has_counter)
- BIC_PRESENT(cai->bic);
+ BIC_PRESENT(cai->bic_number);
}
free(domain_visited);
@@ -8426,7 +8697,7 @@ void cstate_perf_init_(bool soft_c1)
if (!per_core && pkg_visited[pkg_id])
continue;
- const bool counter_needed = BIC_IS_ENABLED(cai->bic) ||
+ const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) ||
(soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY));
const bool counter_supported = (platform->supported_cstates & cai->feature_mask);
@@ -8453,7 +8724,7 @@ void cstate_perf_init_(bool soft_c1)
/* If any CPU has access to the counter, make it present */
if (has_counter)
- BIC_PRESENT(cai->bic);
+ BIC_PRESENT(cai->bic_number);
}
free(cores_visited);
@@ -8949,6 +9220,11 @@ void topology_probe(bool startup)
if (cpus[i].die_id > topo.max_die_id)
topo.max_die_id = cpus[i].die_id;
+ /* get l3 information */
+ cpus[i].l3_id = get_l3_id(i);
+ if (cpus[i].l3_id > topo.max_l3_id)
+ topo.max_l3_id = cpus[i].l3_id;
+
/* get numa node information */
cpus[i].physical_node_id = get_physical_node_id(&cpus[i]);
if (cpus[i].physical_node_id > topo.max_node_num)
@@ -8981,6 +9257,9 @@ void topology_probe(bool startup)
if (!summary_only && topo.num_die > 1)
BIC_PRESENT(BIC_Die);
+ if (!summary_only && topo.max_l3_id > 0)
+ BIC_PRESENT(BIC_L3);
+
topo.num_packages = max_package_id + 1;
if (debug > 1)
fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
@@ -9004,8 +9283,8 @@ void topology_probe(bool startup)
if (cpu_is_not_present(i))
continue;
fprintf(outf,
- "cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n",
- i, cpus[i].physical_package_id, cpus[i].die_id,
+ "cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n",
+ i, cpus[i].physical_package_id, cpus[i].die_id, cpus[i].l3_id,
cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
}
@@ -9060,7 +9339,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
int thread_id = cpus[cpu_id].thread_id;
struct thread_data *t;
struct core_data *c;
- struct pkg_data *p;
/* Workaround for systems where physical_node_id==-1
* and logical_node_id==(-1 - topo.num_cpus)
@@ -9070,18 +9348,17 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
c = GET_CORE(core_base, core_id, node_id, pkg_id);
- p = GET_PKG(pkg_base, pkg_id);
t->cpu_id = cpu_id;
if (!cpu_is_not_allowed(cpu_id)) {
if (c->base_cpu < 0)
c->base_cpu = t->cpu_id;
- if (p->base_cpu < 0)
- p->base_cpu = t->cpu_id;
+ if (pkg_base[pkg_id].base_cpu < 0)
+ pkg_base[pkg_id].base_cpu = t->cpu_id;
}
c->core_id = core_id;
- p->package_id = pkg_id;
+ pkg_base[pkg_id].package_id = pkg_id;
}
int initialize_counters(int cpu_id)
@@ -9121,7 +9398,7 @@ void allocate_irq_buffers(void)
err(-1, "calloc %d NMI", topo.max_cpu_num + 1);
}
-int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+int update_topo(PER_THREAD_PARAMS)
{
topo.allowed_cpus++;
if ((int)t->cpu_id == c->base_cpu)
@@ -9189,7 +9466,7 @@ void check_msr_access(void)
void check_perf_access(void)
{
if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access())
- bic_enabled &= ~BIC_IPC;
+ CLR_BIC(BIC_IPC, &bic_enabled);
}
bool perf_has_hybrid_devices(void)
@@ -9758,8 +10035,8 @@ void turbostat_init()
* disable more BICs, since it can't be reported accurately.
*/
if (platform->enable_tsc_tweak && !has_base_hz) {
- bic_enabled &= ~BIC_Busy;
- bic_enabled &= ~BIC_Bzy_MHz;
+ CLR_BIC(BIC_Busy, &bic_enabled);
+ CLR_BIC(BIC_Bzy_MHz, &bic_enabled);
}
}
@@ -9817,6 +10094,7 @@ int fork_it(char **argv)
timersub(&tv_odd, &tv_even, &tv_delta);
if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS))
fprintf(outf, "%s: Counter reset detected\n", progname);
+ delta_platform(&platform_counters_odd, &platform_counters_even);
compute_average(EVEN_COUNTERS);
format_all_counters(EVEN_COUNTERS);
@@ -9848,7 +10126,7 @@ int get_and_dump_counters(void)
void print_version()
{
- fprintf(outf, "turbostat version 2025.06.08 - Len Brown <lenb@kernel.org>\n");
+ fprintf(outf, "turbostat version 2025.09.09 - Len Brown <lenb@kernel.org>\n");
}
#define COMMAND_LINE_SIZE 2048
@@ -10145,6 +10423,10 @@ void parse_add_command_msr(char *add_command)
format = FORMAT_RAW;
goto next;
}
+ if (!strncmp(add_command, "average", strlen("average"))) {
+ format = FORMAT_AVERAGE;
+ goto next;
+ }
if (!strncmp(add_command, "delta", strlen("delta"))) {
format = FORMAT_DELTA;
goto next;
@@ -10417,13 +10699,19 @@ next:
has_format = true;
}
+ if (strcmp("average", format_name) == 0) {
+ format = FORMAT_AVERAGE;
+ has_format = true;
+ }
+
if (strcmp("delta", format_name) == 0) {
format = FORMAT_DELTA;
has_format = true;
}
if (!has_format) {
- fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name);
+ fprintf(stderr, "%s: Invalid format %s. Expected raw, average or delta\n",
+ __func__, format_name);
exit(1);
}
}
@@ -10513,8 +10801,10 @@ int is_deferred_add(char *name)
int i;
for (i = 0; i < deferred_add_index; ++i)
- if (!strcmp(name, deferred_add_names[i]))
+ if (!strcmp(name, deferred_add_names[i])) {
+ deferred_add_consumed |= (1 << i);
return 1;
+ }
return 0;
}
@@ -10523,11 +10813,34 @@ int is_deferred_skip(char *name)
int i;
for (i = 0; i < deferred_skip_index; ++i)
- if (!strcmp(name, deferred_skip_names[i]))
+ if (!strcmp(name, deferred_skip_names[i])) {
+ deferred_skip_consumed |= (1 << i);
return 1;
+ }
return 0;
}
+void verify_deferred_consumed(void)
+{
+ int i;
+ int fail = 0;
+
+ for (i = 0; i < deferred_add_index; ++i) {
+ if (!(deferred_add_consumed & (1 << i))) {
+ warnx("Counter '%s' can not be added.", deferred_add_names[i]);
+ fail++;
+ }
+ }
+ for (i = 0; i < deferred_skip_index; ++i) {
+ if (!(deferred_skip_consumed & (1 << i))) {
+ warnx("Counter '%s' can not be skipped.", deferred_skip_names[i]);
+ fail++;
+ }
+ }
+ if (fail)
+ exit(-EINVAL);
+}
+
void probe_cpuidle_residency(void)
{
char path[64];
@@ -10537,9 +10850,6 @@ void probe_cpuidle_residency(void)
int min_state = 1024, max_state = 0;
char *sp;
- if (!DO_BIC(BIC_pct_idle))
- return;
-
for (state = 10; state >= 0; --state) {
sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
@@ -10752,22 +11062,29 @@ void cmdline(int argc, char **argv)
no_perf = 1;
break;
case 'e':
- /* --enable specified counter */
- bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST);
+ /* --enable specified counter, without clearning existing list */
+ bic_lookup(&bic_enabled, optarg, SHOW_LIST);
break;
case 'f':
force_load++;
break;
case 'd':
debug++;
- ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
+ bic_set_all(&bic_enabled);
break;
case 'H':
/*
* --hide: do not show those specified
* multiple invocations simply clear more bits in enabled mask
*/
- bic_enabled &= ~bic_lookup(optarg, HIDE_LIST);
+ {
+ cpu_set_t bic_group_hide;
+
+ BIC_INIT(&bic_group_hide);
+
+ bic_lookup(&bic_group_hide, optarg, HIDE_LIST);
+ bic_clear_bits(&bic_enabled, &bic_group_hide);
+ }
break;
case 'h':
default:
@@ -10791,7 +11108,7 @@ void cmdline(int argc, char **argv)
rapl_joules++;
break;
case 'l':
- ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
+ bic_set_all(&bic_enabled);
list_header_only++;
quiet++;
break;
@@ -10828,9 +11145,8 @@ void cmdline(int argc, char **argv)
* subsequent invocations can add to it.
*/
if (shown == 0)
- bic_enabled = bic_lookup(optarg, SHOW_LIST);
- else
- bic_enabled |= bic_lookup(optarg, SHOW_LIST);
+ BIC_INIT(&bic_enabled);
+ bic_lookup(&bic_enabled, optarg, SHOW_LIST);
shown = 1;
break;
case 'S':
@@ -10867,6 +11183,8 @@ int main(int argc, char **argv)
{
int fd, ret;
+ bic_groups_init();
+
fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);
if (fd < 0)
goto skip_cgroup_setting;
@@ -10889,6 +11207,8 @@ skip_cgroup_setting:
probe_cpuidle_residency();
probe_cpuidle_counts();
+ verify_deferred_consumed();
+
if (!getuid())
set_rlimit();
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index 530752ddde8e..c1cfd297aabf 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -229,7 +229,8 @@
#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \
(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \
- (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64))
+ (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) || \
+ (defined(__TARGET_ARCH_powerpc))
#define CAN_USE_LOAD_ACQ_STORE_REL
#endif
diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
index 632ab44737ec..c952640f163b 100644
--- a/tools/testing/selftests/cachestat/test_cachestat.c
+++ b/tools/testing/selftests/cachestat/test_cachestat.c
@@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs)
cs->nr_evicted, cs->nr_recently_evicted);
}
+enum file_type {
+ FILE_MMAP,
+ FILE_SHMEM
+};
+
bool write_exactly(int fd, size_t filesize)
{
int random_fd = open("/dev/urandom", O_RDONLY);
@@ -201,8 +206,20 @@ out1:
out:
return ret;
}
+const char *file_type_str(enum file_type type)
+{
+ switch (type) {
+ case FILE_SHMEM:
+ return "shmem";
+ case FILE_MMAP:
+ return "mmap";
+ default:
+ return "unknown";
+ }
+}
-bool test_cachestat_shmem(void)
+
+bool run_cachestat_test(enum file_type type)
{
size_t PS = sysconf(_SC_PAGESIZE);
size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */
@@ -212,27 +229,50 @@ bool test_cachestat_shmem(void)
char *filename = "tmpshmcstat";
struct cachestat cs;
bool ret = true;
+ int fd;
unsigned long num_pages = compute_len / PS;
- int fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+ if (type == FILE_SHMEM)
+ fd = shm_open(filename, O_CREAT | O_RDWR, 0600);
+ else
+ fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd < 0) {
- ksft_print_msg("Unable to create shmem file.\n");
+ ksft_print_msg("Unable to create %s file.\n",
+ file_type_str(type));
ret = false;
goto out;
}
if (ftruncate(fd, filesize)) {
- ksft_print_msg("Unable to truncate shmem file.\n");
+ ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type));
ret = false;
goto close_fd;
}
+ switch (type) {
+ case FILE_SHMEM:
+ if (!write_exactly(fd, filesize)) {
+ ksft_print_msg("Unable to write to file.\n");
+ ret = false;
+ goto close_fd;
+ }
+ break;
+ case FILE_MMAP:
+ char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
- if (!write_exactly(fd, filesize)) {
- ksft_print_msg("Unable to write to shmem file.\n");
+ if (map == MAP_FAILED) {
+ ksft_print_msg("mmap failed.\n");
+ ret = false;
+ goto close_fd;
+ }
+ for (int i = 0; i < filesize; i++)
+ map[i] = 'A';
+ break;
+ default:
+ ksft_print_msg("Unsupported file type.\n");
ret = false;
goto close_fd;
}
-
syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0);
if (syscall_ret) {
@@ -308,12 +348,18 @@ int main(void)
break;
}
- if (test_cachestat_shmem())
+ if (run_cachestat_test(FILE_SHMEM))
ksft_test_result_pass("cachestat works with a shmem file\n");
else {
ksft_test_result_fail("cachestat fails with a shmem file\n");
ret = 1;
}
+ if (run_cachestat_test(FILE_MMAP))
+ ksft_test_result_pass("cachestat works with a mmap file\n");
+ else {
+ ksft_test_result_fail("cachestat fails with a mmap file\n");
+ ret = 1;
+ }
return ret;
}
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 3556f3563e08..984ece05f7f9 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -11,6 +11,7 @@ TEST_GEN_FILES := \
TEST_PROGS := \
napi_id.py \
+ napi_threaded.py \
netcons_basic.sh \
netcons_cmdline.sh \
netcons_fragmented_msg.sh \
diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py
new file mode 100755
index 000000000000..b2698db39817
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/napi_threaded.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Test napi threaded states.
+"""
+
+from lib.py import ksft_run, ksft_exit
+from lib.py import ksft_eq, ksft_ne, ksft_ge
+from lib.py import NetDrvEnv, NetdevFamily
+from lib.py import cmd, defer, ethtool
+
+
+def _assert_napi_threaded_enabled(nl, napi_id) -> None:
+ napi = nl.napi_get({'id': napi_id})
+ ksft_eq(napi['threaded'], 'enabled')
+ ksft_ne(napi.get('pid'), None)
+
+
+def _assert_napi_threaded_disabled(nl, napi_id) -> None:
+ napi = nl.napi_get({'id': napi_id})
+ ksft_eq(napi['threaded'], 'disabled')
+ ksft_eq(napi.get('pid'), None)
+
+
+def _set_threaded_state(cfg, threaded) -> None:
+ cmd(f"echo {threaded} > /sys/class/net/{cfg.ifname}/threaded")
+
+
+def _setup_deferred_cleanup(cfg) -> None:
+ combined = ethtool(f"-l {cfg.ifname}", json=True)[0].get("combined", 0)
+ ksft_ge(combined, 2)
+ defer(ethtool, f"-L {cfg.ifname} combined {combined}")
+
+ threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout
+ defer(_set_threaded_state, cfg, threaded)
+
+
+def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None:
+ """
+ Test that when napi threaded is enabled at device level and
+ then disabled at napi level for one napi, the threaded state
+ of all napis is preserved after a change in number of queues.
+ """
+
+ napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True)
+ ksft_ge(len(napis), 2)
+
+ napi0_id = napis[0]['id']
+ napi1_id = napis[1]['id']
+
+ _setup_deferred_cleanup(cfg)
+
+ # set threaded
+ _set_threaded_state(cfg, 1)
+
+ # check napi threaded is set for both napis
+ _assert_napi_threaded_enabled(nl, napi0_id)
+ _assert_napi_threaded_enabled(nl, napi1_id)
+
+ # disable threaded for napi1
+ nl.napi_set({'id': napi1_id, 'threaded': 'disabled'})
+
+ cmd(f"ethtool -L {cfg.ifname} combined 1")
+ cmd(f"ethtool -L {cfg.ifname} combined 2")
+ _assert_napi_threaded_enabled(nl, napi0_id)
+ _assert_napi_threaded_disabled(nl, napi1_id)
+
+
+def change_num_queues(cfg, nl) -> None:
+ """
+ Test that when napi threaded is enabled at device level,
+ the napi threaded state is preserved after a change in
+ number of queues.
+ """
+
+ napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True)
+ ksft_ge(len(napis), 2)
+
+ napi0_id = napis[0]['id']
+ napi1_id = napis[1]['id']
+
+ _setup_deferred_cleanup(cfg)
+
+ # set threaded
+ _set_threaded_state(cfg, 1)
+
+ # check napi threaded is set for both napis
+ _assert_napi_threaded_enabled(nl, napi0_id)
+ _assert_napi_threaded_enabled(nl, napi1_id)
+
+ cmd(f"ethtool -L {cfg.ifname} combined 1")
+ cmd(f"ethtool -L {cfg.ifname} combined 2")
+
+ # check napi threaded is set for both napis
+ _assert_napi_threaded_enabled(nl, napi0_id)
+ _assert_napi_threaded_enabled(nl, napi1_id)
+
+
+def main() -> None:
+ """ Ksft boiler plate main """
+
+ with NetDrvEnv(__file__, queue_count=2) as cfg:
+ ksft_run([change_num_queues,
+ enable_dev_threaded_disable_napi_threaded],
+ args=(cfg, NetdevFamily()))
+ ksft_exit()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh
index e8e0dc088d6a..01d0c044a5fc 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh
@@ -1053,6 +1053,6 @@ trap cleanup EXIT
setup_prepare
-tests_run
+xfail_on_slow tests_run
exit $EXIT_STATUS
diff --git a/tools/testing/selftests/kho/arm64.conf b/tools/testing/selftests/kho/arm64.conf
new file mode 100644
index 000000000000..ee696807cd35
--- /dev/null
+++ b/tools/testing/selftests/kho/arm64.conf
@@ -0,0 +1,9 @@
+QEMU_CMD="qemu-system-aarch64 -M virt -cpu max"
+QEMU_KCONFIG="
+CONFIG_SERIAL_AMBA_PL010=y
+CONFIG_SERIAL_AMBA_PL010_CONSOLE=y
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+"
+KERNEL_IMAGE="Image"
+KERNEL_CMDLINE="console=ttyAMA0"
diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c
new file mode 100644
index 000000000000..8034e24c6bf6
--- /dev/null
+++ b/tools/testing/selftests/kho/init.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef NOLIBC
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/reboot.h>
+#endif
+
+/* from arch/x86/include/asm/setup.h */
+#define COMMAND_LINE_SIZE 2048
+
+/* from include/linux/kexex.h */
+#define KEXEC_FILE_NO_INITRAMFS 0x00000004
+
+#define KHO_FINILIZE "/debugfs/kho/out/finalize"
+#define KERNEL_IMAGE "/kernel"
+
+static int mount_filesystems(void)
+{
+ if (mount("debugfs", "/debugfs", "debugfs", 0, NULL) < 0)
+ return -1;
+
+ return mount("proc", "/proc", "proc", 0, NULL);
+}
+
+static int kho_enable(void)
+{
+ const char enable[] = "1";
+ int fd;
+
+ fd = open(KHO_FINILIZE, O_RDWR);
+ if (fd < 0)
+ return -1;
+
+ if (write(fd, enable, sizeof(enable)) != sizeof(enable))
+ return 1;
+
+ close(fd);
+ return 0;
+}
+
+static long kexec_file_load(int kernel_fd, int initrd_fd,
+ unsigned long cmdline_len, const char *cmdline,
+ unsigned long flags)
+{
+ return syscall(__NR_kexec_file_load, kernel_fd, initrd_fd, cmdline_len,
+ cmdline, flags);
+}
+
+static int kexec_load(void)
+{
+ char cmdline[COMMAND_LINE_SIZE];
+ ssize_t len;
+ int fd, err;
+
+ fd = open("/proc/cmdline", O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ len = read(fd, cmdline, sizeof(cmdline));
+ close(fd);
+ if (len < 0)
+ return -1;
+
+ /* replace \n with \0 */
+ cmdline[len - 1] = 0;
+ fd = open(KERNEL_IMAGE, O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ err = kexec_file_load(fd, -1, len, cmdline, KEXEC_FILE_NO_INITRAMFS);
+ close(fd);
+
+ return err ? : 0;
+}
+
+int main(int argc, char *argv[])
+{
+ if (mount_filesystems())
+ goto err_reboot;
+
+ if (kho_enable())
+ goto err_reboot;
+
+ if (kexec_load())
+ goto err_reboot;
+
+ if (reboot(RB_KEXEC))
+ goto err_reboot;
+
+ return 0;
+
+err_reboot:
+ reboot(RB_AUTOBOOT);
+ return -1;
+}
diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh
new file mode 100755
index 000000000000..ec70a17bd476
--- /dev/null
+++ b/tools/testing/selftests/kho/vmtest.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -ue
+
+CROSS_COMPILE="${CROSS_COMPILE:-""}"
+
+test_dir=$(realpath "$(dirname "$0")")
+kernel_dir=$(realpath "$test_dir/../../../..")
+
+tmp_dir=$(mktemp -d /tmp/kho-test.XXXXXXXX)
+headers_dir="$tmp_dir/usr"
+initrd_dir="$tmp_dir/initrd"
+initrd="$tmp_dir/initrd.cpio"
+
+source "$test_dir/../kselftest/ktap_helpers.sh"
+
+function usage() {
+ cat <<EOF
+$0 [-d build_dir] [-j jobs] [-t target_arch] [-h]
+Options:
+ -d) path to the kernel build directory
+ -j) number of jobs for compilation, similar to -j in make
+ -t) run test for target_arch, requires CROSS_COMPILE set
+ supported targets: aarch64, x86_64
+ -h) display this help
+EOF
+}
+
+function cleanup() {
+ rm -fr "$tmp_dir"
+ ktap_finished
+}
+trap cleanup EXIT
+
+function skip() {
+ local msg=${1:-""}
+
+ ktap_test_skip "$msg"
+ exit "$KSFT_SKIP"
+}
+
+function fail() {
+ local msg=${1:-""}
+
+ ktap_test_fail "$msg"
+ exit "$KSFT_FAIL"
+}
+
+function build_kernel() {
+ local build_dir=$1
+ local make_cmd=$2
+ local arch_kconfig=$3
+ local kimage=$4
+
+ local kho_config="$tmp_dir/kho.config"
+ local kconfig="$build_dir/.config"
+
+ # enable initrd, KHO and KHO test in kernel configuration
+ tee "$kconfig" > "$kho_config" <<EOF
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KEXEC_HANDOVER=y
+CONFIG_TEST_KEXEC_HANDOVER=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_VM=y
+$arch_kconfig
+EOF
+
+ make_cmd="$make_cmd -C $kernel_dir O=$build_dir"
+ $make_cmd olddefconfig
+
+ # verify that kernel confiration has all necessary options
+ while read -r opt ; do
+ grep "$opt" "$kconfig" &>/dev/null || skip "$opt is missing"
+ done < "$kho_config"
+
+ $make_cmd "$kimage"
+ $make_cmd headers_install INSTALL_HDR_PATH="$headers_dir"
+}
+
+function mkinitrd() {
+ local kernel=$1
+
+ mkdir -p "$initrd_dir"/{dev,debugfs,proc}
+ sudo mknod "$initrd_dir/dev/console" c 5 1
+
+ "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -I"$headers_dir/include" \
+ -fno-asynchronous-unwind-tables -fno-ident -nostdlib \
+ -include "$test_dir/../../../include/nolibc/nolibc.h" \
+ -o "$initrd_dir/init" "$test_dir/init.c" \
+
+ cp "$kernel" "$initrd_dir/kernel"
+
+ pushd "$initrd_dir" &>/dev/null
+ find . | cpio -H newc --create > "$initrd" 2>/dev/null
+ popd &>/dev/null
+}
+
+function run_qemu() {
+ local qemu_cmd=$1
+ local cmdline=$2
+ local kernel=$3
+ local serial="$tmp_dir/qemu.serial"
+
+ cmdline="$cmdline kho=on panic=-1"
+
+ $qemu_cmd -m 1G -smp 2 -no-reboot -nographic -nodefaults \
+ -accel kvm -accel hvf -accel tcg \
+ -serial file:"$serial" \
+ -append "$cmdline" \
+ -kernel "$kernel" \
+ -initrd "$initrd"
+
+ grep "KHO restore succeeded" "$serial" &> /dev/null || fail "KHO failed"
+}
+
+function target_to_arch() {
+ local target=$1
+
+ case $target in
+ aarch64) echo "arm64" ;;
+ x86_64) echo "x86" ;;
+ *) skip "architecture $target is not supported"
+ esac
+}
+
+function main() {
+ local build_dir="$kernel_dir/.kho"
+ local jobs=$(($(nproc) * 2))
+ local target="$(uname -m)"
+
+ # skip the test if any of the preparation steps fails
+ set -o errtrace
+ trap skip ERR
+
+ while getopts 'hd:j:t:' opt; do
+ case $opt in
+ d)
+ build_dir="$OPTARG"
+ ;;
+ j)
+ jobs="$OPTARG"
+ ;;
+ t)
+ target="$OPTARG"
+ ;;
+ h)
+ usage
+ exit 0
+ ;;
+ *)
+ echo Unknown argument "$opt"
+ usage
+ exit 1
+ ;;
+ esac
+ done
+
+ ktap_print_header
+ ktap_set_plan 1
+
+ if [[ "$target" != "$(uname -m)" ]] && [[ -z "$CROSS_COMPILE" ]]; then
+ skip "Cross-platform testing needs to specify CROSS_COMPILE"
+ fi
+
+ mkdir -p "$build_dir"
+ local arch=$(target_to_arch "$target")
+ source "$test_dir/$arch.conf"
+
+ # build the kernel and create initrd
+ # initrd includes the kernel image that will be kexec'ed
+ local make_cmd="make ARCH=$arch CROSS_COMPILE=$CROSS_COMPILE -j$jobs"
+ build_kernel "$build_dir" "$make_cmd" "$QEMU_KCONFIG" "$KERNEL_IMAGE"
+
+ local kernel="$build_dir/arch/$arch/boot/$KERNEL_IMAGE"
+ mkinitrd "$kernel"
+
+ run_qemu "$QEMU_CMD" "$KERNEL_CMDLINE" "$kernel"
+
+ ktap_test_pass "KHO succeeded"
+}
+
+main "$@"
diff --git a/tools/testing/selftests/kho/x86.conf b/tools/testing/selftests/kho/x86.conf
new file mode 100644
index 000000000000..b419e610ca22
--- /dev/null
+++ b/tools/testing/selftests/kho/x86.conf
@@ -0,0 +1,7 @@
+QEMU_CMD=qemu-system-x86_64
+QEMU_KCONFIG="
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+"
+KERNEL_IMAGE="bzImage"
+KERNEL_CMDLINE="console=ttyS0"
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index f2dafa0b700b..e7b23a8a05fe 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,6 +21,7 @@ on-fault-limit
transhuge-stress
pagemap_ioctl
pfnmap
+process_madv
*.tmp*
protection_keys
protection_keys_32
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl
TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..471cae8427f1
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+FIXTURE(process_madvise)
+{
+ unsigned long page_size;
+ pid_t child_pid;
+ int remote_pidfd;
+ int pidfd;
+};
+
+FIXTURE_SETUP(process_madvise)
+{
+ self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+ self->pidfd = PIDFD_SELF;
+ self->remote_pidfd = -1;
+ self->child_pid = -1;
+};
+
+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{
+ /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
+ if (self->child_pid > 0) {
+ kill(self->child_pid, SIGKILL);
+ waitpid(self->child_pid, NULL, 0);
+ }
+
+ if (self->remote_pidfd >= 0)
+ close(self->remote_pidfd);
+}
+
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+ size_t vlen, int advice, unsigned int flags)
+{
+ return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+}
+
+/*
+ * This test uses PIDFD_SELF to target the current process. The main
+ * goal is to verify the basic behavior of process_madvise() with
+ * a vector of non-contiguous memory ranges, not its cross-process
+ * capabilities.
+ */
+TEST_F(process_madvise, basic)
+{
+ const unsigned long pagesize = self->page_size;
+ const int madvise_pages = 4;
+ struct iovec vec[madvise_pages];
+ int pidfd = self->pidfd;
+ ssize_t ret;
+ char *map;
+
+ /*
+ * Create a single large mapping. We will pick pages from this
+ * mapping to advise on. This ensures we test non-contiguous iovecs.
+ */
+ map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ /* Fill the entire region with a known pattern. */
+ memset(map, 'A', pagesize * 10);
+
+ /*
+ * Setup the iovec to point to 4 non-contiguous pages
+ * within the mapping.
+ */
+ vec[0].iov_base = &map[0 * pagesize];
+ vec[0].iov_len = pagesize;
+ vec[1].iov_base = &map[3 * pagesize];
+ vec[1].iov_len = pagesize;
+ vec[2].iov_base = &map[5 * pagesize];
+ vec[2].iov_len = pagesize;
+ vec[3].iov_base = &map[8 * pagesize];
+ vec[3].iov_len = pagesize;
+
+ ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
+ if (ret == -1 && errno == EPERM)
+ SKIP(return,
+ "process_madvise() unsupported or permission denied, try running as root.\n");
+ else if (errno == EINVAL)
+ SKIP(return,
+ "process_madvise() unsupported or parameter invalid, please check arguments.\n");
+
+ /* The call should succeed and report the total bytes processed. */
+ ASSERT_EQ(ret, madvise_pages * pagesize);
+
+ /* Check that advised pages are now zero. */
+ for (int i = 0; i < madvise_pages; i++) {
+ char *advised_page = (char *)vec[i].iov_base;
+
+ /* Content must be 0, not 'A'. */
+ ASSERT_EQ(*advised_page, '\0');
+ }
+
+ /* Check that an un-advised page in between is still 'A'. */
+ char *unadvised_page = &map[1 * pagesize];
+
+ for (int i = 0; i < pagesize; i++)
+ ASSERT_EQ(unadvised_page[i], 'A');
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize * 10), 0);
+}
+
+/*
+ * This test deterministically validates process_madvise() with MADV_COLLAPSE
+ * on a remote process, other advices are difficult to verify reliably.
+ *
+ * The test verifies that a memory region in a child process,
+ * focus on process_madv remote result, only check addresses and lengths.
+ * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+ const unsigned long pagesize = self->page_size;
+ long huge_page_size;
+ int pipe_info[2];
+ ssize_t ret;
+ struct iovec vec;
+
+ struct child_info {
+ pid_t pid;
+ void *map_addr;
+ } info;
+
+ huge_page_size = read_pmd_pagesize();
+ if (huge_page_size <= 0)
+ SKIP(return, "Could not determine a valid huge page size.\n");
+
+ ASSERT_EQ(pipe(pipe_info), 0);
+
+ self->child_pid = fork();
+ ASSERT_NE(self->child_pid, -1);
+
+ if (self->child_pid == 0) {
+ char *map;
+ size_t map_size = 2 * huge_page_size;
+
+ close(pipe_info[0]);
+
+ map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(map, MAP_FAILED);
+
+ /* Fault in as small pages */
+ for (size_t i = 0; i < map_size; i += pagesize)
+ map[i] = 'A';
+
+ /* Send info and pause */
+ info.pid = getpid();
+ info.map_addr = map;
+ ret = write(pipe_info[1], &info, sizeof(info));
+ ASSERT_EQ(ret, sizeof(info));
+ close(pipe_info[1]);
+
+ pause();
+ exit(0);
+ }
+
+ close(pipe_info[1]);
+
+ /* Receive child info */
+ ret = read(pipe_info[0], &info, sizeof(info));
+ if (ret <= 0) {
+ waitpid(self->child_pid, NULL, 0);
+ SKIP(return, "Failed to read child info from pipe.\n");
+ }
+ ASSERT_EQ(ret, sizeof(info));
+ close(pipe_info[0]);
+ self->child_pid = info.pid;
+
+ self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+ ASSERT_GE(self->remote_pidfd, 0);
+
+ vec.iov_base = info.map_addr;
+ vec.iov_len = huge_page_size;
+
+ ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
+ 0);
+ if (ret == -1) {
+ if (errno == EINVAL)
+ SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
+ else if (errno == EPERM)
+ SKIP(return,
+ "No process_madvise() permissions, try running as root.\n");
+ return;
+ }
+
+ ASSERT_EQ(ret, huge_page_size);
+}
+
+/*
+ * Test process_madvise() with a pidfd for a process that has already
+ * exited to ensure correct error handling.
+ */
+TEST_F(process_madvise, exited_process_pidfd)
+{
+ const unsigned long pagesize = self->page_size;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ /*
+ * Using a pidfd for a process that has already exited should fail
+ * with ESRCH.
+ */
+ self->child_pid = fork();
+ ASSERT_NE(self->child_pid, -1);
+
+ if (self->child_pid == 0)
+ exit(0);
+
+ self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+ ASSERT_GE(self->remote_pidfd, 0);
+
+ /* Wait for the child to ensure it has terminated. */
+ waitpid(self->child_pid, NULL, 0);
+
+ ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
+ 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, ESRCH);
+}
+
+/*
+ * Test process_madvise() with bad pidfds to ensure correct error
+ * handling.
+ */
+TEST_F(process_madvise, bad_pidfd)
+{
+ const unsigned long pagesize = self->page_size;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ /* Using an invalid fd number (-1) should fail with EBADF. */
+ ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EBADF);
+
+ /*
+ * Using a valid fd that is not a pidfd (e.g. stdin) should fail
+ * with EBADF.
+ */
+ ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EBADF);
+}
+
+/*
+ * Test that process_madvise() rejects vlen > UIO_MAXIOV.
+ * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
+ */
+TEST_F(process_madvise, invalid_vlen)
+{
+ const unsigned long pagesize = self->page_size;
+ int pidfd = self->pidfd;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+/*
+ * Test process_madvise() with an invalid flag value. Currently, only a flag
+ * value of 0 is supported. This test is reserved for the future, e.g., if
+ * synchronous flags are added.
+ */
+TEST_F(process_madvise, flag)
+{
+ const unsigned long pagesize = self->page_size;
+ unsigned int invalid_flag;
+ int pidfd = self->pidfd;
+ struct iovec vec;
+ char *map;
+ ssize_t ret;
+
+ map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+ 0);
+ if (map == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+
+ vec.iov_base = map;
+ vec.iov_len = pagesize;
+
+ invalid_flag = 0x80000000;
+
+ ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
+ ASSERT_EQ(ret, -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a38c984103ce..471e539d82b8 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
test pagemap_scan IOCTL
- pfnmap
tests for VM_PFNMAP handling
+- process_madv
+ test for process_madv
- cow
test copy-on-write semantics
- thp
@@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+# PROCESS_MADV test
+CATEGORY="process_madv" run_test ./process_madv
+
CATEGORY="vma_merge" run_test ./merge
if [ -x ./memfd_secret ]
diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
index c5b01e1bd4c7..a7e790af38ff 100755
--- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh
+++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
@@ -35,24 +35,7 @@ failfunc=ktap_test_fail
if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then
optargs+=('--tolerance_usecs=14000')
-
- # xfail tests that are known flaky with dbg config, not fixable.
- # still run them for coverage (and expect 100% pass without dbg).
- declare -ar xfail_list=(
- "tcp_blocking_blocking-connect.pkt"
- "tcp_blocking_blocking-read.pkt"
- "tcp_eor_no-coalesce-retrans.pkt"
- "tcp_fast_recovery_prr-ss.*.pkt"
- "tcp_sack_sack-route-refresh-ip-tos.pkt"
- "tcp_slow_start_slow-start-after-win-update.pkt"
- "tcp_timestamping.*.pkt"
- "tcp_user_timeout_user-timeout-probe.pkt"
- "tcp_zerocopy_cl.*.pkt"
- "tcp_zerocopy_epoll_.*.pkt"
- "tcp_tcp_info_tcp-info-.*-limited.pkt"
- )
- readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$"
- [[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail
+ failfunc=ktap_test_xfail
fi
ktap_print_header
diff --git a/tools/testing/selftests/net/test_neigh.sh b/tools/testing/selftests/net/test_neigh.sh
index 388056472b5b..7c594bf6ead0 100755
--- a/tools/testing/selftests/net/test_neigh.sh
+++ b/tools/testing/selftests/net/test_neigh.sh
@@ -289,11 +289,11 @@ extern_valid_common()
orig_base_reachable=$(ip -j ntable show name "$tbl_name" | jq '.[] | select(has("thresh1")) | .["base_reachable"]')
run_cmd "ip ntable change name $tbl_name thresh1 10 base_reachable 10000"
orig_gc_stale=$(ip -n "$ns1" -j ntable show name "$tbl_name" dev veth0 | jq '.[]["gc_stale"]')
- run_cmd "ip -n $ns1 ntable change name $tbl_name dev veth0 gc_stale 5000"
- # Wait orig_base_reachable/2 for the new interval to take effect.
- run_cmd "sleep $(((orig_base_reachable / 1000) / 2 + 2))"
+ run_cmd "ip -n $ns1 ntable change name $tbl_name dev veth0 gc_stale 1000"
run_cmd "ip -n $ns1 neigh add $ip_addr lladdr $mac nud stale dev veth0 extern_valid"
run_cmd "ip -n $ns1 neigh add ${subnet}3 lladdr $mac nud stale dev veth0"
+ # Wait orig_base_reachable/2 for the new interval to take effect.
+ run_cmd "sleep $(((orig_base_reachable / 1000) / 2 + 2))"
for i in {1..20}; do
run_cmd "ip -n $ns1 neigh add ${subnet}$((i + 4)) nud none dev veth0"
done
diff --git a/tools/testing/selftests/net/vlan_hw_filter.sh b/tools/testing/selftests/net/vlan_hw_filter.sh
index 0fb56baf28e4..e195d5cab6f7 100755
--- a/tools/testing/selftests/net/vlan_hw_filter.sh
+++ b/tools/testing/selftests/net/vlan_hw_filter.sh
@@ -55,10 +55,10 @@ test_vlan0_del_crash_01() {
ip netns exec ${NETNS} ip link add bond0 type bond mode 0
ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q
ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
- ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link set dev bond0 up
ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on
- ip netns exec ${NETNS} ifconfig bond0 down
- ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link set dev bond0 down
+ ip netns exec ${NETNS} ip link set dev bond0 up
ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function"
cleanup
}
@@ -68,11 +68,11 @@ test_vlan0_del_crash_02() {
setup
ip netns exec ${NETNS} ip link add bond0 type bond mode 0
ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
- ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link set dev bond0 up
ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on
ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q
- ip netns exec ${NETNS} ifconfig bond0 down
- ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link set dev bond0 down
+ ip netns exec ${NETNS} ip link set dev bond0 up
ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function"
cleanup
}
@@ -84,9 +84,9 @@ test_vlan0_del_crash_03() {
ip netns exec ${NETNS} ip link add bond0 type bond mode 0
ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q
ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
- ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link set dev bond0 up
ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on
- ip netns exec ${NETNS} ifconfig bond0 down
+ ip netns exec ${NETNS} ip link set dev bond0 down
ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function"
cleanup
}
diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore
index ee93dc4969b8..4931b3b6bbd3 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -2,3 +2,4 @@
sigtrap_threads
remove_on_exec
watermark_signal
+mmap
diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile
index 70e3ff211278..2e5d85770dfe 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDFLAGS += -lpthread
-TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal mmap
include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/mmap.c b/tools/testing/selftests/perf_events/mmap.c
new file mode 100644
index 000000000000..ea0427aac1f9
--- /dev/null
+++ b/tools/testing/selftests/perf_events/mmap.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+
+#include <dirent.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+#include <linux/perf_event.h>
+
+#include "../kselftest_harness.h"
+
+#define RB_SIZE 0x3000
+#define AUX_SIZE 0x10000
+#define AUX_OFFS 0x4000
+
+#define HOLE_SIZE 0x1000
+
+/* Reserve space for rb, aux with space for shrink-beyond-vma testing. */
+#define REGION_SIZE (2 * RB_SIZE + 2 * AUX_SIZE)
+#define REGION_AUX_OFFS (2 * RB_SIZE)
+
+#define MAP_BASE 1
+#define MAP_AUX 2
+
+#define EVENT_SRC_DIR "/sys/bus/event_source/devices"
+
+FIXTURE(perf_mmap)
+{
+ int fd;
+ void *ptr;
+ void *region;
+};
+
+FIXTURE_VARIANT(perf_mmap)
+{
+ bool aux;
+ unsigned long ptr_size;
+};
+
+FIXTURE_VARIANT_ADD(perf_mmap, rb)
+{
+ .aux = false,
+ .ptr_size = RB_SIZE,
+};
+
+FIXTURE_VARIANT_ADD(perf_mmap, aux)
+{
+ .aux = true,
+ .ptr_size = AUX_SIZE,
+};
+
+static bool read_event_type(struct dirent *dent, __u32 *type)
+{
+ char typefn[512];
+ FILE *fp;
+ int res;
+
+ snprintf(typefn, sizeof(typefn), "%s/%s/type", EVENT_SRC_DIR, dent->d_name);
+ fp = fopen(typefn, "r");
+ if (!fp)
+ return false;
+
+ res = fscanf(fp, "%u", type);
+ fclose(fp);
+ return res > 0;
+}
+
+FIXTURE_SETUP(perf_mmap)
+{
+ struct perf_event_attr attr = {
+ .size = sizeof(attr),
+ .disabled = 1,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ };
+ struct perf_event_attr attr_ok = {};
+ unsigned int eacces = 0, map = 0;
+ struct perf_event_mmap_page *rb;
+ struct dirent *dent;
+ void *aux, *region;
+ DIR *dir;
+
+ self->ptr = NULL;
+
+ dir = opendir(EVENT_SRC_DIR);
+ if (!dir)
+ SKIP(return, "perf not available.");
+
+ region = mmap(NULL, REGION_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(region, MAP_FAILED);
+ self->region = region;
+
+ // Try to find a suitable event on this system
+ while ((dent = readdir(dir))) {
+ int fd;
+
+ if (!read_event_type(dent, &attr.type))
+ continue;
+
+ fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+ if (fd < 0) {
+ if (errno == EACCES)
+ eacces++;
+ continue;
+ }
+
+ // Check whether the event supports mmap()
+ rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+ if (rb == MAP_FAILED) {
+ close(fd);
+ continue;
+ }
+
+ if (!map) {
+ // Save the event in case that no AUX capable event is found
+ attr_ok = attr;
+ map = MAP_BASE;
+ }
+
+ if (!variant->aux)
+ continue;
+
+ rb->aux_offset = AUX_OFFS;
+ rb->aux_size = AUX_SIZE;
+
+ // Check whether it supports a AUX buffer
+ aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, AUX_OFFS);
+ if (aux == MAP_FAILED) {
+ munmap(rb, RB_SIZE);
+ close(fd);
+ continue;
+ }
+
+ attr_ok = attr;
+ map = MAP_AUX;
+ munmap(aux, AUX_SIZE);
+ munmap(rb, RB_SIZE);
+ close(fd);
+ break;
+ }
+ closedir(dir);
+
+ if (!map) {
+ if (!eacces)
+ SKIP(return, "No mappable perf event found.");
+ else
+ SKIP(return, "No permissions for perf_event_open()");
+ }
+
+ self->fd = syscall(SYS_perf_event_open, &attr_ok, 0, -1, -1, 0);
+ ASSERT_NE(self->fd, -1);
+
+ rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, self->fd, 0);
+ ASSERT_NE(rb, MAP_FAILED);
+
+ if (!variant->aux) {
+ self->ptr = rb;
+ return;
+ }
+
+ if (map != MAP_AUX)
+ SKIP(return, "No AUX event found.");
+
+ rb->aux_offset = AUX_OFFS;
+ rb->aux_size = AUX_SIZE;
+ aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, self->fd, AUX_OFFS);
+ ASSERT_NE(aux, MAP_FAILED);
+ self->ptr = aux;
+}
+
+FIXTURE_TEARDOWN(perf_mmap)
+{
+ ASSERT_EQ(munmap(self->region, REGION_SIZE), 0);
+ if (self->fd != -1)
+ ASSERT_EQ(close(self->fd), 0);
+}
+
+TEST_F(perf_mmap, remap)
+{
+ void *tmp, *ptr = self->ptr;
+ unsigned long size = variant->ptr_size;
+
+ // Test the invalid remaps
+ ASSERT_EQ(mremap(ptr, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+ ASSERT_EQ(mremap(ptr + HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+ ASSERT_EQ(mremap(ptr + size - HOLE_SIZE, HOLE_SIZE, size, MREMAP_MAYMOVE), MAP_FAILED);
+ // Shrink the end of the mapping such that we only unmap past end of the VMA,
+ // which should succeed and poke a hole into the PROT_NONE region
+ ASSERT_NE(mremap(ptr + size - HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+
+ // Remap the whole buffer to a new address
+ tmp = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(tmp, MAP_FAILED);
+
+ // Try splitting offset 1 hole size into VMA, this should fail
+ ASSERT_EQ(mremap(ptr + HOLE_SIZE, size - HOLE_SIZE, size - HOLE_SIZE,
+ MREMAP_MAYMOVE | MREMAP_FIXED, tmp), MAP_FAILED);
+ // Remapping the whole thing should succeed fine
+ ptr = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tmp);
+ ASSERT_EQ(ptr, tmp);
+ ASSERT_EQ(munmap(tmp, size), 0);
+}
+
+TEST_F(perf_mmap, unmap)
+{
+ unsigned long size = variant->ptr_size;
+
+ // Try to poke holes into the mappings
+ ASSERT_NE(munmap(self->ptr, HOLE_SIZE), 0);
+ ASSERT_NE(munmap(self->ptr + HOLE_SIZE, HOLE_SIZE), 0);
+ ASSERT_NE(munmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE), 0);
+}
+
+TEST_F(perf_mmap, map)
+{
+ unsigned long size = variant->ptr_size;
+
+ // Try to poke holes into the mappings by mapping anonymous memory over it
+ ASSERT_EQ(mmap(self->ptr, HOLE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+ ASSERT_EQ(mmap(self->ptr + HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+ ASSERT_EQ(mmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore
index b7dde152e75a..f6be8efd57ea 100644
--- a/tools/testing/selftests/ptrace/.gitignore
+++ b/tools/testing/selftests/ptrace/.gitignore
@@ -3,3 +3,4 @@ get_syscall_info
get_set_sud
peeksiginfo
vmaccess
+set_syscall_info
diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
index a40097232967..ba58589a1145 100644
--- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
+++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
@@ -32,12 +32,12 @@ void workload_hint_exit(int signum)
fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR);
if (fd < 0) {
- perror("Unable to open workload type feature enable file\n");
+ perror("Unable to open workload type feature enable file");
exit(1);
}
if (write(fd, "0\n", 2) < 0) {
- perror("Can't disable workload hints\n");
+ perror("Can't disable workload hints");
exit(1);
}
@@ -68,16 +68,14 @@ int main(int argc, char **argv)
exit(1);
sprintf(delay_str, "%s\n", argv[1]);
-
- sprintf(delay_str, "%s\n", argv[1]);
fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR);
if (fd < 0) {
- perror("Unable to open workload notification delay\n");
+ perror("Unable to open workload notification delay");
exit(1);
}
if (write(fd, delay_str, strlen(delay_str)) < 0) {
- perror("Can't set delay\n");
+ perror("Can't set delay");
exit(1);
}
@@ -94,12 +92,12 @@ int main(int argc, char **argv)
/* Enable feature via sysfs knob */
fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR);
if (fd < 0) {
- perror("Unable to open workload type feature enable file\n");
+ perror("Unable to open workload type feature enable file");
exit(1);
}
if (write(fd, "1\n", 2) < 0) {
- perror("Can't enable workload hints\n");
+ perror("Can't enable workload hints");
exit(1);
}
@@ -110,7 +108,7 @@ int main(int argc, char **argv)
while (1) {
fd = open(WORKLOAD_TYPE_INDEX_ATTRIBUTE, O_RDONLY);
if (fd < 0) {
- perror("Unable to open workload type file\n");
+ perror("Unable to open workload type file");
exit(1);
}
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index a838c37f93e5..3639aa8dd2b0 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr;
#define CAP_IPC_LOCK 14
#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#define VM_SEALED_BIT 42
+#define VM_SEALED BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED VM_NONE
#endif
#define FIRST_USER_ADDRESS 0UL