diff options
Diffstat (limited to 'tools')
27 files changed, 2598 insertions, 344 deletions
diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 11def1ad046c..20bbd461515e 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays procacct +PROGS := getdelays procacct delaytop all: $(PROGS) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c new file mode 100644 index 000000000000..9afb1ffc00ba --- /dev/null +++ b/tools/accounting/delaytop.c @@ -0,0 +1,862 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * delaytop.c - system-wide delay monitoring tool. + * + * This tool provides real-time monitoring and statistics of + * system, container, and task-level delays, including CPU, + * memory, IO, and IRQ. It supports both interactive (top-like) and + * one-shot output, and can report delay information for the whole + * system, specific containers (cgroups), or individual tasks (PIDs). + * + * Key features: + * - Collects per-task delay accounting statistics via taskstats. + * - Collects system-wide PSI information. + * - Supports sorting and filtering. + * - Supports both interactive (screen refresh) and one-shot output. + * + * Copyright (C) Fan Yu, ZTE Corp. 2025 + * Copyright (C) Wang Yaxin, ZTE Corp. 
2025 + * + * Compile with + * gcc -I/usr/src/linux/include delaytop.c -o delaytop + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <getopt.h> +#include <signal.h> +#include <time.h> +#include <dirent.h> +#include <ctype.h> +#include <stdbool.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/select.h> +#include <termios.h> +#include <limits.h> +#include <linux/genetlink.h> +#include <linux/taskstats.h> +#include <linux/cgroupstats.h> + +#define PSI_CPU_SOME "/proc/pressure/cpu" +#define PSI_CPU_FULL "/proc/pressure/cpu" +#define PSI_MEMORY_SOME "/proc/pressure/memory" +#define PSI_MEMORY_FULL "/proc/pressure/memory" +#define PSI_IO_SOME "/proc/pressure/io" +#define PSI_IO_FULL "/proc/pressure/io" +#define PSI_IRQ_FULL "/proc/pressure/irq" + +#define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) + +#define TASK_COMM_LEN 16 +#define MAX_MSG_SIZE 1024 +#define MAX_TASKS 1000 +#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field +#define BOOL_FPRINT(stream, fmt, ...) 
\ +({ \ + int ret = fprintf(stream, fmt, ##__VA_ARGS__); \ + ret >= 0; \ +}) +#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" + +/* Program settings structure */ +struct config { + int delay; /* Update interval in seconds */ + int iterations; /* Number of iterations, 0 == infinite */ + int max_processes; /* Maximum number of processes to show */ + char sort_field; /* Field to sort by */ + int output_one_time; /* Output once and exit */ + int monitor_pid; /* Monitor specific PID */ + char *container_path; /* Path to container cgroup */ +}; + +/* PSI statistics structure */ +struct psi_stats { + double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300; + unsigned long long cpu_some_total; + double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300; + unsigned long long cpu_full_total; + double memory_some_avg10, memory_some_avg60, memory_some_avg300; + unsigned long long memory_some_total; + double memory_full_avg10, memory_full_avg60, memory_full_avg300; + unsigned long long memory_full_total; + double io_some_avg10, io_some_avg60, io_some_avg300; + unsigned long long io_some_total; + double io_full_avg10, io_full_avg60, io_full_avg300; + unsigned long long io_full_total; + double irq_full_avg10, irq_full_avg60, irq_full_avg300; + unsigned long long irq_full_total; +}; + +/* Task delay information structure */ +struct task_info { + int pid; + int tgid; + char command[TASK_COMM_LEN]; + unsigned long long cpu_count; + unsigned long long cpu_delay_total; + unsigned long long blkio_count; + unsigned long long blkio_delay_total; + unsigned long long swapin_count; + unsigned long long swapin_delay_total; + unsigned long long freepages_count; + unsigned long long freepages_delay_total; + unsigned long long thrashing_count; + unsigned long long thrashing_delay_total; + unsigned long long compact_count; + unsigned long long compact_delay_total; + unsigned long long wpcopy_count; + unsigned long long wpcopy_delay_total; + unsigned long long irq_count; + 
unsigned long long irq_delay_total; +}; + +/* Container statistics structure */ +struct container_stats { + int nr_sleeping; /* Number of sleeping processes */ + int nr_running; /* Number of running processes */ + int nr_stopped; /* Number of stopped processes */ + int nr_uninterruptible; /* Number of uninterruptible processes */ + int nr_io_wait; /* Number of processes in IO wait */ +}; + +/* Global variables */ +static struct config cfg; +static struct psi_stats psi; +static struct task_info tasks[MAX_TASKS]; +static int task_count; +static int running = 1; +static struct container_stats container_stats; + +/* Netlink socket variables */ +static int nl_sd = -1; +static int family_id; + +/* Set terminal to non-canonical mode for q-to-quit */ +static struct termios orig_termios; +static void enable_raw_mode(void) +{ + struct termios raw; + + tcgetattr(STDIN_FILENO, &orig_termios); + raw = orig_termios; + raw.c_lflag &= ~(ICANON | ECHO); + tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw); +} +static void disable_raw_mode(void) +{ + tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); +} + +/* Display usage information and command line options */ +static void usage(void) +{ + printf("Usage: delaytop [Options]\n" + "Options:\n" + " -h, --help Show this help message and exit\n" + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" + " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" + " -o, --once Display once and exit\n" + " -p, --pid=PID Monitor only the specified PID\n" + " -C, --container=PATH Monitor the container at specified cgroup path\n"); + exit(0); +} + +/* Parse command line arguments and set configuration */ +static void parse_args(int argc, char **argv) +{ + int c; + struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"delay", required_argument, 0, 'd'}, + {"iterations", required_argument, 0, 'n'}, + {"pid", 
required_argument, 0, 'p'}, + {"once", no_argument, 0, 'o'}, + {"processes", required_argument, 0, 'P'}, + {"container", required_argument, 0, 'C'}, + {0, 0, 0, 0} + }; + + /* Set defaults */ + cfg.delay = 2; + cfg.iterations = 0; + cfg.max_processes = 20; + cfg.sort_field = 'c'; /* Default sort by CPU delay */ + cfg.output_one_time = 0; + cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ + cfg.container_path = NULL; + + while (1) { + int option_index = 0; + + c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + usage(); + break; + case 'd': + cfg.delay = atoi(optarg); + if (cfg.delay < 1) { + fprintf(stderr, "Error: delay must be >= 1.\n"); + exit(1); + } + break; + case 'n': + cfg.iterations = atoi(optarg); + if (cfg.iterations < 0) { + fprintf(stderr, "Error: iterations must be >= 0.\n"); + exit(1); + } + break; + case 'p': + cfg.monitor_pid = atoi(optarg); + if (cfg.monitor_pid < 1) { + fprintf(stderr, "Error: pid must be >= 1.\n"); + exit(1); + } + break; + case 'o': + cfg.output_one_time = 1; + break; + case 'P': + cfg.max_processes = atoi(optarg); + if (cfg.max_processes < 1) { + fprintf(stderr, "Error: processes must be >= 1.\n"); + exit(1); + } + if (cfg.max_processes > MAX_TASKS) { + fprintf(stderr, "Warning: processes capped to %d.\n", + MAX_TASKS); + cfg.max_processes = MAX_TASKS; + } + break; + case 'C': + cfg.container_path = strdup(optarg); + break; + default: + fprintf(stderr, "Try 'delaytop --help' for more information.\n"); + exit(1); + } + } +} + +/* Create a raw netlink socket and bind */ +static int create_nl_socket(void) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd < 0) + return -1; + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) { + fprintf(stderr, "Failed to bind socket when create nl_socket\n"); + close(fd); + return 
-1; + } + + return fd; +} + +/* Send a command via netlink */ +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct sockaddr_nl nladdr; + struct nlattr *na; + int r, buflen; + char *buf; + + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + +/* Get family ID for taskstats via netlink; returns 0 on failure */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + char name[100]; + + strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1); + name[sizeof(name) - 1] = '\0'; + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) { + fprintf(stderr, "Failed to send cmd for family id\n"); + return 0; + } + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if ((rep_len < 0) || !NLMSG_OK((&ans.n), rep_len) || + ans.n.nlmsg_type == NLMSG_ERROR) { /* validate the length before reading the header */ + fprintf(stderr, "Failed to receive response for family id\n"); + return 0; + } + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + 
NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + return id; +} + +static void read_psi_stats(void) +{ + FILE *fp; + char line[256]; + int ret = 0; /* number of fields matched by the last sscanf() */ + /* Zero all fields */ + memset(&psi, 0, sizeof(psi)); + /* CPU pressure */ + fp = fopen(PSI_CPU_SOME, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "some", 4) == 0) { + ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.cpu_some_avg10, &psi.cpu_some_avg60, + &psi.cpu_some_avg300, &psi.cpu_some_total); + if (ret != 4) + fprintf(stderr, "Failed to parse CPU some PSI data\n"); + } else if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.cpu_full_avg10, &psi.cpu_full_avg60, + &psi.cpu_full_avg300, &psi.cpu_full_total); + if (ret != 4) + fprintf(stderr, "Failed to parse CPU full PSI data\n"); + } + } + fclose(fp); + } + /* Memory pressure */ + fp = fopen(PSI_MEMORY_SOME, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "some", 4) == 0) { + ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.memory_some_avg10, &psi.memory_some_avg60, + &psi.memory_some_avg300, &psi.memory_some_total); + if (ret != 4) + fprintf(stderr, "Failed to parse Memory some PSI data\n"); + } else if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.memory_full_avg10, &psi.memory_full_avg60, + &psi.memory_full_avg300, &psi.memory_full_total); + if (ret != 4) /* only report for the "full" line that was just parsed */ + fprintf(stderr, "Failed to parse Memory full PSI data\n"); + } + } + fclose(fp); + } + /* IO pressure */ + fp = fopen(PSI_IO_SOME, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "some", 4) == 0) { + ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.io_some_avg10, &psi.io_some_avg60, + &psi.io_some_avg300, &psi.io_some_total); + if (ret != 4) + 
fprintf(stderr, "Failed to parse IO some PSI data\n"); + } else if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.io_full_avg10, &psi.io_full_avg60, + &psi.io_full_avg300, &psi.io_full_total); + if (ret != 4) + fprintf(stderr, "Failed to parse IO full PSI data\n"); + } + } + fclose(fp); + } + /* IRQ pressure (only full) */ + fp = fopen(PSI_IRQ_FULL, "r"); + if (fp) { + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "full", 4) == 0) { + ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", + &psi.irq_full_avg10, &psi.irq_full_avg60, + &psi.irq_full_avg300, &psi.irq_full_total); + if (ret != 4) + fprintf(stderr, "Failed to parse IRQ full PSI data\n"); + } + } + fclose(fp); + } +} + +static int read_comm(int pid, char *comm_buf, size_t buf_size) +{ + char path[64]; + int ret = -1; + size_t len; + FILE *fp; + + snprintf(path, sizeof(path), "/proc/%d/comm", pid); + fp = fopen(path, "r"); + if (!fp) { + fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid); + return ret; + } + + if (fgets(comm_buf, buf_size, fp)) { + len = strlen(comm_buf); + if (len > 0 && comm_buf[len - 1] == '\n') + comm_buf[len - 1] = '\0'; + ret = 0; + } + + fclose(fp); + + return ret; +} + +static void fetch_and_fill_task_info(int pid, const char *comm) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } resp; + struct taskstats stats; + struct nlattr *nested; + struct nlattr *na; + int nested_len; + int nl_len; + int rc; + + /* Send request for task stats */ + if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) { + fprintf(stderr, "Failed to send request for task stats\n"); + return; + } + + /* Receive response */ + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { + fprintf(stderr, "Failed to receive response for task stats\n"); + return; + } + + /* Parse 
response */ + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) { + nested = (struct nlattr *) NLA_DATA(na); + nested_len = NLA_PAYLOAD(na->nla_len); + while (nested_len > 0) { + if (nested->nla_type == TASKSTATS_TYPE_STATS) { + memcpy(&stats, NLA_DATA(nested), sizeof(stats)); + if (task_count < MAX_TASKS) { + tasks[task_count].pid = pid; + tasks[task_count].tgid = pid; + strncpy(tasks[task_count].command, comm, + TASK_COMM_LEN - 1); + tasks[task_count].command[TASK_COMM_LEN - 1] = '\0'; + SET_TASK_STAT(task_count, cpu_count); + SET_TASK_STAT(task_count, cpu_delay_total); + SET_TASK_STAT(task_count, blkio_count); + SET_TASK_STAT(task_count, blkio_delay_total); + SET_TASK_STAT(task_count, swapin_count); + SET_TASK_STAT(task_count, swapin_delay_total); + SET_TASK_STAT(task_count, freepages_count); + SET_TASK_STAT(task_count, freepages_delay_total); + SET_TASK_STAT(task_count, thrashing_count); + SET_TASK_STAT(task_count, thrashing_delay_total); + SET_TASK_STAT(task_count, compact_count); + SET_TASK_STAT(task_count, compact_delay_total); + SET_TASK_STAT(task_count, wpcopy_count); + SET_TASK_STAT(task_count, wpcopy_delay_total); + SET_TASK_STAT(task_count, irq_count); + SET_TASK_STAT(task_count, irq_delay_total); + task_count++; + } + break; + } + nested_len -= NLA_ALIGN(nested->nla_len); + nested = NLA_NEXT(nested); + } + } + nl_len -= NLA_ALIGN(na->nla_len); + na = NLA_NEXT(na); + } + return; +} + +static void get_task_delays(void) +{ + char comm[TASK_COMM_LEN]; + struct dirent *entry; + DIR *dir; + int pid; + + task_count = 0; + if (cfg.monitor_pid > 0) { + if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0) + fetch_and_fill_task_info(cfg.monitor_pid, comm); + return; + } + + dir = opendir("/proc"); + if (!dir) { + fprintf(stderr, "Error opening /proc directory\n"); + return; + } + + while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) { + if 
(!isdigit(entry->d_name[0])) + continue; + pid = atoi(entry->d_name); + if (pid == 0) + continue; + if (read_comm(pid, comm, sizeof(comm)) != 0) + continue; + fetch_and_fill_task_info(pid, comm); + } + closedir(dir); +} + +/* Calculate average delay in milliseconds */ +static double average_ms(unsigned long long total, unsigned long long count) +{ + if (count == 0) + return 0; + return (double)total / 1000000.0 / count; +} + +/* Comparison function for sorting tasks */ +static int compare_tasks(const void *a, const void *b) +{ + const struct task_info *t1 = (const struct task_info *)a; + const struct task_info *t2 = (const struct task_info *)b; + double avg1, avg2; + + switch (cfg.sort_field) { + case 'c': /* CPU */ + avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); + avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); + if (avg1 != avg2) + return avg2 > avg1 ? 1 : -1; + return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + + default: + return t2->cpu_delay_total > t1->cpu_delay_total ? 
1 : -1; + } +} + +/* Sort tasks by selected field */ +static void sort_tasks(void) +{ + if (task_count > 0) + qsort(tasks, task_count, sizeof(struct task_info), compare_tasks); +} + +/* Get container statistics via cgroupstats */ +static void get_container_stats(void) +{ + int rc, cfd; + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } req, resp; + struct nlattr *na; + int nl_len; + struct cgroupstats stats; + + /* Check if container path is set */ + if (!cfg.container_path) + return; + + /* Open container cgroup */ + cfd = open(cfg.container_path, O_RDONLY); + if (cfd < 0) { + fprintf(stderr, "Error opening container path: %s\n", cfg.container_path); + return; + } + + /* Send request for container stats */ + if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) { + fprintf(stderr, "Failed to send request for container stats\n"); + close(cfd); + return; + } + + /* Receive response */ + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { + fprintf(stderr, "Failed to receive response for container stats\n"); + close(cfd); + return; + } + + /* Parse response */ + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) { + /* Get the cgroupstats structure */ + memcpy(&stats, NLA_DATA(na), sizeof(stats)); + + /* Fill container stats */ + container_stats.nr_sleeping = stats.nr_sleeping; + container_stats.nr_running = stats.nr_running; + container_stats.nr_stopped = stats.nr_stopped; + container_stats.nr_uninterruptible = stats.nr_uninterruptible; + container_stats.nr_io_wait = stats.nr_io_wait; + break; + } + nl_len -= NLA_ALIGN(na->nla_len); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + } + + close(cfd); +} + +/* Display results to stdout or log file */ +static void display_results(void) +{ + time_t now = 
time(NULL); + struct tm *tm_now = localtime(&now); + FILE *out = stdout; + char timestamp[32]; + bool suc = true; + int i, count; + + /* Clear terminal screen */ + suc &= BOOL_FPRINT(out, "\033[H\033[J"); + + /* PSI output (one-line, no cat style) */ + suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n"); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU some:", + psi.cpu_some_avg10, + psi.cpu_some_avg60, + psi.cpu_some_avg300, + psi.cpu_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU full:", + psi.cpu_full_avg10, + psi.cpu_full_avg60, + psi.cpu_full_avg300, + psi.cpu_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory full:", + psi.memory_full_avg10, + psi.memory_full_avg60, + psi.memory_full_avg300, + psi.memory_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory some:", + psi.memory_some_avg10, + psi.memory_some_avg60, + psi.memory_some_avg300, + psi.memory_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO full:", + psi.io_full_avg10, + psi.io_full_avg60, + psi.io_full_avg300, + psi.io_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO some:", + psi.io_some_avg10, + psi.io_some_avg60, + psi.io_some_avg300, + psi.io_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IRQ full:", + psi.irq_full_avg10, + psi.irq_full_avg60, + psi.irq_full_avg300, + psi.irq_full_total / 1000); + + if (cfg.container_path) { + suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path); + suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ", + container_stats.nr_running, container_stats.nr_sleeping); + suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n", + container_stats.nr_stopped, container_stats.nr_uninterruptible, + container_stats.nr_io_wait); + } + suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n", + cfg.max_processes); + suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", 
"TGID", "COMMAND"); + suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n", + "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)", + "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)"); + + suc &= BOOL_FPRINT(out, "-----------------------------------------------"); + suc &= BOOL_FPRINT(out, "----------------------------------------------\n"); + count = task_count < cfg.max_processes ? task_count : cfg.max_processes; + + for (i = 0; i < count; i++) { + suc &= BOOL_FPRINT(out, "%5d %5d %-15s", + tasks[i].pid, tasks[i].tgid, tasks[i].command); + suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", + average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), + average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), + average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), + average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), + average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), + average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), + average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), + average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); + } + + suc &= BOOL_FPRINT(out, "\n"); + + if (!suc) + perror("Error writing to output"); +} + +/* Main function */ +int main(int argc, char **argv) +{ + int iterations = 0; + int use_q_quit = 0; + + /* Parse command line arguments */ + parse_args(argc, argv); + + /* Setup netlink socket */ + nl_sd = create_nl_socket(); + if (nl_sd < 0) { + fprintf(stderr, "Error creating netlink socket\n"); + exit(1); + } + + /* Get family ID for taskstats via netlink */ + family_id = get_family_id(nl_sd); + if (!family_id) { + fprintf(stderr, "Error getting taskstats family ID\n"); + close(nl_sd); + exit(1); + } + + if (!cfg.output_one_time) { + use_q_quit = 1; + enable_raw_mode(); + printf("Press 'q' to quit.\n"); + fflush(stdout); + } + + /* Main loop */ + while (running) { + /* Read PSI statistics */ + read_psi_stats(); + + /* Get container stats if 
container path provided */ + if (cfg.container_path) + get_container_stats(); + + /* Get task delays */ + get_task_delays(); + + /* Sort tasks */ + sort_tasks(); + + /* Display results to stdout or log file */ + display_results(); + + /* Check for iterations */ + if (cfg.iterations > 0 && ++iterations >= cfg.iterations) + break; + + /* Exit if output_one_time is set */ + if (cfg.output_one_time) + break; + + /* Check for 'q' key to quit */ + if (use_q_quit) { + struct timeval tv = {cfg.delay, 0}; + fd_set readfds; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); + + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { + char ch = 0; + + read(STDIN_FILENO, &ch, 1); + if (ch == 'q' || ch == 'Q') { + running = 0; + break; + } + } + } else { + sleep(cfg.delay); + } + } + + /* Restore terminal mode */ + if (use_q_quit) + disable_raw_mode(); + + /* Cleanup */ + close(nl_sd); + if (cfg.container_path) + free(cfg.container_path); + + return 0; +} diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 3feac0482fe9..21cb3c3d1331 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -194,75 +194,108 @@ static int get_family_id(int sd) #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) #define delay_ms(t) (t / 1000000ULL) +/* + * Version compatibility note: + * Field availability depends on taskstats version (t->version), + * corresponding to TASKSTATS_VERSION in kernel headers + * see include/uapi/linux/taskstats.h + * + * Version feature mapping: + * version >= 11 - supports COMPACT statistics + * version >= 13 - supports WPCOPY statistics + * version >= 14 - supports IRQ statistics + * version >= 16 - supports *_max and *_min delay statistics + * + * Always verify version before accessing version-dependent fields + * to maintain backward compatibility. 
+ */ +#define PRINT_CPU_DELAY(version, t) \ + do { \ + if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average", "delay max", "delay min"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ + delay_ms((double)(t)->cpu_delay_max), \ + delay_ms((double)(t)->cpu_delay_min)); \ + } else { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count)); \ + } \ + } while (0) +#define PRINT_FILED_DELAY(name, version, t, count, total, max, min) \ + do { \ + if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min)); \ + } else { \ + printf("%-10s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average"); \ + printf(" %15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count)); \ + } \ + } while (0) + static void print_delayacct(struct taskstats *t) { - printf("\n\nCPU %15s%15s%15s%15s%15s%15s%15s\n" - " 
%15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "IO %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "SWAP %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "RECLAIM %12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "THRASHING%12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "COMPACT %12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "WPCOPY %12s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" - "IRQ %15s%15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms%13.6fms\n", - "count", "real total", "virtual total", - "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->cpu_count, - (unsigned long long)t->cpu_run_real_total, - (unsigned long long)t->cpu_run_virtual_total, - (unsigned long long)t->cpu_delay_total, - average_ms((double)t->cpu_delay_total, t->cpu_count), - delay_ms((double)t->cpu_delay_max), - delay_ms((double)t->cpu_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->blkio_count, - (unsigned long long)t->blkio_delay_total, - average_ms((double)t->blkio_delay_total, t->blkio_count), - delay_ms((double)t->blkio_delay_max), - delay_ms((double)t->blkio_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->swapin_count, - (unsigned long long)t->swapin_delay_total, - average_ms((double)t->swapin_delay_total, t->swapin_count), - delay_ms((double)t->swapin_delay_max), - delay_ms((double)t->swapin_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->freepages_count, - (unsigned long long)t->freepages_delay_total, - average_ms((double)t->freepages_delay_total, t->freepages_count), - delay_ms((double)t->freepages_delay_max), - delay_ms((double)t->freepages_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long 
long)t->thrashing_count, - (unsigned long long)t->thrashing_delay_total, - average_ms((double)t->thrashing_delay_total, t->thrashing_count), - delay_ms((double)t->thrashing_delay_max), - delay_ms((double)t->thrashing_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->compact_count, - (unsigned long long)t->compact_delay_total, - average_ms((double)t->compact_delay_total, t->compact_count), - delay_ms((double)t->compact_delay_max), - delay_ms((double)t->compact_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->wpcopy_count, - (unsigned long long)t->wpcopy_delay_total, - average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), - delay_ms((double)t->wpcopy_delay_max), - delay_ms((double)t->wpcopy_delay_min), - "count", "delay total", "delay average", "delay max", "delay min", - (unsigned long long)t->irq_count, - (unsigned long long)t->irq_delay_total, - average_ms((double)t->irq_delay_total, t->irq_count), - delay_ms((double)t->irq_delay_max), - delay_ms((double)t->irq_delay_min)); + printf("\n\n"); + + PRINT_CPU_DELAY(t->version, t); + + PRINT_FILED_DELAY("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min); + + PRINT_FILED_DELAY("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min); + + PRINT_FILED_DELAY("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min); + + PRINT_FILED_DELAY("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min); + + if (t->version >= 11) { + PRINT_FILED_DELAY("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min); + } + + if (t->version >= 13) { + PRINT_FILED_DELAY("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min); + } + + if (t->version >= 
14) { + PRINT_FILED_DELAY("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min); + } } static void task_context_switch_counts(struct taskstats *t) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index fb11108aaf42..3340def58d01 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -47,10 +47,11 @@ name as necessary to disambiguate it from others is necessary. Note that option MSRs are read as 64-bits, u32 truncates the displayed value to 32-bits. default: u64 - format: {\fBraw\fP | \fBdelta\fP | \fBpercent\fP} + format: {\fBraw\fP | \fBdelta\fP | \fBpercent\fP | \fBaverage\fP} 'raw' shows the MSR contents in hex. 'delta' shows the difference in values during the measurement interval. 'percent' shows the delta as a percentage of the cycles elapsed. + 'average' similar to raw, but also averaged for node/package summaries (or when using -S). default: delta name: "name_string" @@ -186,6 +187,14 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. .PP +\fBTotl%C0\fP Weighted percentage of time that CPUs are busy. If N CPUs are busy during an interval, the percentage is N * 100%. +.PP +\fBAny%C0\fP Percentage of time that at least one CPU is busy. +.PP +\fBGFX%C0\fP Percentage of time that at least one GFX compute engine is busy. +.PP +\fBCPUGFX%\fP Percentage of time that at least one CPU is busy at the same time as at least one Graphics compute engine is busy. +.PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. .PP \fBPkgWatt\fP Watts consumed by the whole package. 
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5230e072e414..72a280e7a9d5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -67,6 +67,7 @@ #include <stdbool.h> #include <assert.h> #include <linux/kernel.h> +#include <limits.h> #define UNUSED(x) (void)(x) @@ -194,6 +195,7 @@ struct msr_counter bic[] = { { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "L3", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, @@ -209,91 +211,238 @@ struct msr_counter bic[] = { { 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 }, }; -#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) -#define BIC_USEC (1ULL << 0) -#define BIC_TOD (1ULL << 1) -#define BIC_Package (1ULL << 2) -#define BIC_Node (1ULL << 3) -#define BIC_Avg_MHz (1ULL << 4) -#define BIC_Busy (1ULL << 5) -#define BIC_Bzy_MHz (1ULL << 6) -#define BIC_TSC_MHz (1ULL << 7) -#define BIC_IRQ (1ULL << 8) -#define BIC_SMI (1ULL << 9) -#define BIC_cpuidle (1ULL << 10) -#define BIC_CPU_c1 (1ULL << 11) -#define BIC_CPU_c3 (1ULL << 12) -#define BIC_CPU_c6 (1ULL << 13) -#define BIC_CPU_c7 (1ULL << 14) -#define BIC_ThreadC (1ULL << 15) -#define BIC_CoreTmp (1ULL << 16) -#define BIC_CoreCnt (1ULL << 17) -#define BIC_PkgTmp (1ULL << 18) -#define BIC_GFX_rc6 (1ULL << 19) -#define BIC_GFXMHz (1ULL << 20) -#define BIC_Pkgpc2 (1ULL << 21) -#define BIC_Pkgpc3 (1ULL << 22) -#define BIC_Pkgpc6 (1ULL << 23) -#define BIC_Pkgpc7 (1ULL << 24) -#define BIC_Pkgpc8 (1ULL << 25) -#define BIC_Pkgpc9 (1ULL << 26) -#define BIC_Pkgpc10 (1ULL << 27) -#define BIC_CPU_LPI (1ULL << 28) -#define BIC_SYS_LPI (1ULL << 29) -#define BIC_PkgWatt (1ULL << 30) -#define BIC_CorWatt (1ULL << 31) -#define BIC_GFXWatt (1ULL << 32) -#define BIC_PkgCnt (1ULL << 33) -#define BIC_RAMWatt (1ULL << 
34) -#define BIC_PKG__ (1ULL << 35) -#define BIC_RAM__ (1ULL << 36) -#define BIC_Pkg_J (1ULL << 37) -#define BIC_Cor_J (1ULL << 38) -#define BIC_GFX_J (1ULL << 39) -#define BIC_RAM_J (1ULL << 40) -#define BIC_Mod_c6 (1ULL << 41) -#define BIC_Totl_c0 (1ULL << 42) -#define BIC_Any_c0 (1ULL << 43) -#define BIC_GFX_c0 (1ULL << 44) -#define BIC_CPUGFX (1ULL << 45) -#define BIC_Core (1ULL << 46) -#define BIC_CPU (1ULL << 47) -#define BIC_APIC (1ULL << 48) -#define BIC_X2APIC (1ULL << 49) -#define BIC_Die (1ULL << 50) -#define BIC_GFXACTMHz (1ULL << 51) -#define BIC_IPC (1ULL << 52) -#define BIC_CORE_THROT_CNT (1ULL << 53) -#define BIC_UNCORE_MHZ (1ULL << 54) -#define BIC_SAM_mc6 (1ULL << 55) -#define BIC_SAMMHz (1ULL << 56) -#define BIC_SAMACTMHz (1ULL << 57) -#define BIC_Diec6 (1ULL << 58) -#define BIC_SysWatt (1ULL << 59) -#define BIC_Sys_J (1ULL << 60) -#define BIC_NMI (1ULL << 61) -#define BIC_CPU_c1e (1ULL << 62) -#define BIC_pct_idle (1ULL << 63) - -#define BIC_GROUP_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die) -#define BIC_GROUP_THERMAL_PWR (BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__ | BIC_SysWatt) -#define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) -#define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle ) -#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle) -#define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) - -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | 
BIC_cpuidle) - -unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); -unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_cpuidle | BIC_pct_idle | BIC_APIC | BIC_X2APIC; - -#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) -#define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME) -#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME) -#define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT) -#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) -#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* n.b. bic_names must match the order in bic[], above */ +enum bic_names { + BIC_USEC, + BIC_TOD, + BIC_Package, + BIC_Node, + BIC_Avg_MHz, + BIC_Busy, + BIC_Bzy_MHz, + BIC_TSC_MHz, + BIC_IRQ, + BIC_SMI, + BIC_cpuidle, + BIC_CPU_c1, + BIC_CPU_c3, + BIC_CPU_c6, + BIC_CPU_c7, + BIC_ThreadC, + BIC_CoreTmp, + BIC_CoreCnt, + BIC_PkgTmp, + BIC_GFX_rc6, + BIC_GFXMHz, + BIC_Pkgpc2, + BIC_Pkgpc3, + BIC_Pkgpc6, + BIC_Pkgpc7, + BIC_Pkgpc8, + BIC_Pkgpc9, + BIC_Pkgpc10, + BIC_CPU_LPI, + BIC_SYS_LPI, + BIC_PkgWatt, + BIC_CorWatt, + BIC_GFXWatt, + BIC_PkgCnt, + BIC_RAMWatt, + BIC_PKG__, + BIC_RAM__, + BIC_Pkg_J, + BIC_Cor_J, + BIC_GFX_J, + BIC_RAM_J, + BIC_Mod_c6, + BIC_Totl_c0, + BIC_Any_c0, + BIC_GFX_c0, + BIC_CPUGFX, + BIC_Core, + BIC_CPU, + BIC_APIC, + BIC_X2APIC, + BIC_Die, + BIC_L3, + BIC_GFXACTMHz, + BIC_IPC, + BIC_CORE_THROT_CNT, + BIC_UNCORE_MHZ, + BIC_SAM_mc6, + BIC_SAMMHz, + BIC_SAMACTMHz, + BIC_Diec6, + BIC_SysWatt, + BIC_Sys_J, + BIC_NMI, + BIC_CPU_c1e, + BIC_pct_idle, + MAX_BIC +}; + +void print_bic_set(char *s, cpu_set_t *set) +{ + int i; + + assert(MAX_BIC < CPU_SETSIZE); + + printf("%s:", s); + + for (i = 0; i <= MAX_BIC; ++i) { + + if (CPU_ISSET(i, set)) { + assert(i < MAX_BIC); + printf(" %s", bic[i].name); + } + } + putchar('\n'); +} + +static cpu_set_t bic_group_topology; +static cpu_set_t bic_group_thermal_pwr; +static cpu_set_t bic_group_frequency; +static 
cpu_set_t bic_group_hw_idle; +static cpu_set_t bic_group_sw_idle; +static cpu_set_t bic_group_idle; +static cpu_set_t bic_group_other; +static cpu_set_t bic_group_disabled_by_default; +static cpu_set_t bic_enabled; +static cpu_set_t bic_present; + +/* modify */ +#define BIC_INIT(set) CPU_ZERO(set) + +#define SET_BIC(COUNTER_NUMBER, set) CPU_SET(COUNTER_NUMBER, set) +#define CLR_BIC(COUNTER_NUMBER, set) CPU_CLR(COUNTER_NUMBER, set) + +#define BIC_PRESENT(COUNTER_NUMBER) SET_BIC(COUNTER_NUMBER, &bic_present) +#define BIC_NOT_PRESENT(COUNTER_NUMBER) CPU_CLR(COUNTER_NUMBER, &bic_present) + +/* test */ +#define BIC_IS_ENABLED(COUNTER_NUMBER) CPU_ISSET(COUNTER_NUMBER, &bic_enabled) +#define DO_BIC_READ(COUNTER_NUMBER) CPU_ISSET(COUNTER_NUMBER, &bic_present) +#define DO_BIC(COUNTER_NUMBER) (CPU_ISSET(COUNTER_NUMBER, &bic_enabled) && CPU_ISSET(COUNTER_NUMBER, &bic_present)) + +static void bic_set_all(cpu_set_t *set) +{ + int i; + + assert(MAX_BIC < CPU_SETSIZE); + + for (i = 0; i < MAX_BIC; ++i) + SET_BIC(i, set); +} + +/* + * bic_clear_bits() + * clear all the bits from "clr" in "dst" + */ +static void bic_clear_bits(cpu_set_t *dst, cpu_set_t *clr) +{ + int i; + + assert(MAX_BIC < CPU_SETSIZE); + + for (i = 0; i < MAX_BIC; ++i) + if (CPU_ISSET(i, clr)) + CLR_BIC(i, dst); +} + +static void bic_groups_init(void) +{ + BIC_INIT(&bic_group_topology); + SET_BIC(BIC_Package, &bic_group_topology); + SET_BIC(BIC_Node, &bic_group_topology); + SET_BIC(BIC_CoreCnt, &bic_group_topology); + SET_BIC(BIC_PkgCnt, &bic_group_topology); + SET_BIC(BIC_Core, &bic_group_topology); + SET_BIC(BIC_CPU, &bic_group_topology); + SET_BIC(BIC_Die, &bic_group_topology); + SET_BIC(BIC_L3, &bic_group_topology); + + BIC_INIT(&bic_group_thermal_pwr); + SET_BIC(BIC_CoreTmp, &bic_group_thermal_pwr); + SET_BIC(BIC_PkgTmp, &bic_group_thermal_pwr); + SET_BIC(BIC_PkgWatt, &bic_group_thermal_pwr); + SET_BIC(BIC_CorWatt, &bic_group_thermal_pwr); + SET_BIC(BIC_GFXWatt, &bic_group_thermal_pwr); + 
SET_BIC(BIC_RAMWatt, &bic_group_thermal_pwr); + SET_BIC(BIC_PKG__, &bic_group_thermal_pwr); + SET_BIC(BIC_RAM__, &bic_group_thermal_pwr); + SET_BIC(BIC_SysWatt, &bic_group_thermal_pwr); + + BIC_INIT(&bic_group_frequency); + SET_BIC(BIC_Avg_MHz, &bic_group_frequency); + SET_BIC(BIC_Busy, &bic_group_frequency); + SET_BIC(BIC_Bzy_MHz, &bic_group_frequency); + SET_BIC(BIC_TSC_MHz, &bic_group_frequency); + SET_BIC(BIC_GFXMHz, &bic_group_frequency); + SET_BIC(BIC_GFXACTMHz, &bic_group_frequency); + SET_BIC(BIC_SAMMHz, &bic_group_frequency); + SET_BIC(BIC_SAMACTMHz, &bic_group_frequency); + SET_BIC(BIC_UNCORE_MHZ, &bic_group_frequency); + + BIC_INIT(&bic_group_hw_idle); + SET_BIC(BIC_Busy, &bic_group_hw_idle); + SET_BIC(BIC_CPU_c1, &bic_group_hw_idle); + SET_BIC(BIC_CPU_c3, &bic_group_hw_idle); + SET_BIC(BIC_CPU_c6, &bic_group_hw_idle); + SET_BIC(BIC_CPU_c7, &bic_group_hw_idle); + SET_BIC(BIC_GFX_rc6, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc2, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc3, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc6, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc7, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc8, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc9, &bic_group_hw_idle); + SET_BIC(BIC_Pkgpc10, &bic_group_hw_idle); + SET_BIC(BIC_CPU_LPI, &bic_group_hw_idle); + SET_BIC(BIC_SYS_LPI, &bic_group_hw_idle); + SET_BIC(BIC_Mod_c6, &bic_group_hw_idle); + SET_BIC(BIC_Totl_c0, &bic_group_hw_idle); + SET_BIC(BIC_Any_c0, &bic_group_hw_idle); + SET_BIC(BIC_GFX_c0, &bic_group_hw_idle); + SET_BIC(BIC_CPUGFX, &bic_group_hw_idle); + SET_BIC(BIC_SAM_mc6, &bic_group_hw_idle); + SET_BIC(BIC_Diec6, &bic_group_hw_idle); + + BIC_INIT(&bic_group_sw_idle); + SET_BIC(BIC_Busy, &bic_group_sw_idle); + SET_BIC(BIC_cpuidle, &bic_group_sw_idle); + SET_BIC(BIC_pct_idle, &bic_group_sw_idle); + + BIC_INIT(&bic_group_idle); + CPU_OR(&bic_group_idle, &bic_group_idle, &bic_group_hw_idle); + SET_BIC(BIC_pct_idle, &bic_group_idle); + + BIC_INIT(&bic_group_other); + SET_BIC(BIC_IRQ, &bic_group_other); + 
SET_BIC(BIC_NMI, &bic_group_other); + SET_BIC(BIC_SMI, &bic_group_other); + SET_BIC(BIC_ThreadC, &bic_group_other); + SET_BIC(BIC_CoreTmp, &bic_group_other); + SET_BIC(BIC_IPC, &bic_group_other); + + BIC_INIT(&bic_group_disabled_by_default); + SET_BIC(BIC_USEC, &bic_group_disabled_by_default); + SET_BIC(BIC_TOD, &bic_group_disabled_by_default); + SET_BIC(BIC_cpuidle, &bic_group_disabled_by_default); + SET_BIC(BIC_APIC, &bic_group_disabled_by_default); + SET_BIC(BIC_X2APIC, &bic_group_disabled_by_default); + + BIC_INIT(&bic_enabled); + bic_set_all(&bic_enabled); + bic_clear_bits(&bic_enabled, &bic_group_disabled_by_default); + + BIC_INIT(&bic_present); + SET_BIC(BIC_USEC, &bic_present); + SET_BIC(BIC_TOD, &bic_present); + SET_BIC(BIC_cpuidle, &bic_present); + SET_BIC(BIC_APIC, &bic_present); + SET_BIC(BIC_X2APIC, &bic_present); + SET_BIC(BIC_pct_idle, &bic_present); +} /* * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: @@ -840,20 +989,21 @@ static const struct platform_features spr_features = { }; static const struct platform_features dmr_features = { - .has_msr_misc_feature_control = spr_features.has_msr_misc_feature_control, - .has_msr_misc_pwr_mgmt = spr_features.has_msr_misc_pwr_mgmt, - .has_nhm_msrs = spr_features.has_nhm_msrs, - .has_config_tdp = spr_features.has_config_tdp, - .bclk_freq = spr_features.bclk_freq, - .supported_cstates = spr_features.supported_cstates, - .cst_limit = spr_features.cst_limit, - .has_msr_core_c1_res = spr_features.has_msr_core_c1_res, - .has_msr_module_c6_res_ms = 1, /* DMR has Dual Core Module and MC6 MSR */ - .has_irtl_msrs = spr_features.has_irtl_msrs, - .has_cst_prewake_bit = spr_features.has_cst_prewake_bit, - .has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit, - .trl_msrs = spr_features.trl_msrs, - .rapl_msrs = 0, /* DMR does not have RAPL MSRs */ + .has_msr_misc_feature_control = spr_features.has_msr_misc_feature_control, + .has_msr_misc_pwr_mgmt = spr_features.has_msr_misc_pwr_mgmt, + 
.has_nhm_msrs = spr_features.has_nhm_msrs, + .bclk_freq = spr_features.bclk_freq, + .supported_cstates = spr_features.supported_cstates, + .cst_limit = spr_features.cst_limit, + .has_msr_core_c1_res = spr_features.has_msr_core_c1_res, + .has_cst_prewake_bit = spr_features.has_cst_prewake_bit, + .has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit, + .trl_msrs = spr_features.trl_msrs, + .has_msr_module_c6_res_ms = 1, /* DMR has Dual-Core-Module and MC6 MSR */ + .rapl_msrs = 0, /* DMR does not have RAPL MSRs */ + .plr_msrs = 0, /* DMR does not have PLR MSRs */ + .has_irtl_msrs = 0, /* DMR does not have IRTL MSRs */ + .has_config_tdp = 0, /* DMR does not have CTDP MSRs */ }; static const struct platform_features srf_features = { @@ -1204,7 +1354,7 @@ struct rapl_counter_arch_info { int msr_shift; /* Positive mean shift right, negative mean shift left */ double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */ unsigned int rci_index; /* Maps data from perf counters to global variables */ - unsigned long long bic; + unsigned int bic_number; double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */ unsigned long long flags; }; @@ -1219,7 +1369,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_energy_units, .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, - .bic = BIC_PkgWatt | BIC_Pkg_J, + .bic_number = BIC_PkgWatt, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PKG, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic_number = BIC_Pkg_J, .compat_scale = 1.0, .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1232,7 +1395,33 @@ static const struct 
rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_energy_units, .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, - .bic = BIC_PkgWatt | BIC_Pkg_J, + .bic_number = BIC_PkgWatt, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STAT, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic_number = BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_CORE_ENERGY_STATUS, + .perf_subsys = "power", + .perf_name = "energy-cores", + .msr = MSR_PP0_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, + .bic_number = BIC_CorWatt, .compat_scale = 1.0, .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1245,7 +1434,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_energy_units, .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, - .bic = BIC_CorWatt | BIC_Cor_J, + .bic_number = BIC_Cor_J, .compat_scale = 1.0, .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1258,7 +1447,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_dram_energy_units, .rci_index = RAPL_RCI_INDEX_DRAM, - .bic = BIC_RAMWatt | BIC_RAM_J, + .bic_number = BIC_RAMWatt, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM, + .perf_subsys = "power", + .perf_name = "energy-ram", + .msr = MSR_DRAM_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_dram_energy_units, + .rci_index = RAPL_RCI_INDEX_DRAM, + .bic_number = BIC_RAM_J, .compat_scale = 1.0, .flags 
= RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1271,7 +1473,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_energy_units, .rci_index = RAPL_RCI_INDEX_GFX, - .bic = BIC_GFXWatt | BIC_GFX_J, + .bic_number = BIC_GFXWatt, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_GFX, + .perf_subsys = "power", + .perf_name = "energy-gpu", + .msr = MSR_PP1_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_GFX, + .bic_number = BIC_GFX_J, .compat_scale = 1.0, .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1284,7 +1499,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_time_units, .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS, - .bic = BIC_PKG__, + .bic_number = BIC_PKG__, .compat_scale = 100.0, .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1297,7 +1512,7 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_time_units, .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS, - .bic = BIC_RAM__, + .bic_number = BIC_RAM__, .compat_scale = 100.0, .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1310,7 +1525,20 @@ static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_energy_units, .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, - .bic = BIC_CorWatt | BIC_Cor_J, + .bic_number = BIC_CorWatt, + .compat_scale = 1.0, + .flags = 0, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_CORE_ENERGY_STAT, + .msr_mask = 0xFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, + .bic_number = BIC_Cor_J, .compat_scale = 1.0, .flags = 0, }, @@ -1323,7 +1551,20 @@ static const struct 
rapl_counter_arch_info rapl_counter_arch_infos[] = { .msr_shift = 0, .platform_rapl_msr_scale = &rapl_psys_energy_units, .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM, - .bic = BIC_SysWatt | BIC_Sys_J, + .bic_number = BIC_SysWatt, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PSYS, + .perf_subsys = "power", + .perf_name = "energy-psys", + .msr = MSR_PLATFORM_ENERGY_STATUS, + .msr_mask = 0x00000000FFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_psys_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PLATFORM, + .bic_number = BIC_Sys_J, .compat_scale = 1.0, .flags = RAPL_COUNTER_FLAG_PLATFORM_COUNTER | RAPL_COUNTER_FLAG_USE_MSR_SUM, }, @@ -1372,7 +1613,7 @@ struct cstate_counter_arch_info { const char *perf_name; unsigned long long msr; unsigned int rci_index; /* Maps data from perf counters to global variables */ - unsigned long long bic; + unsigned int bic_number; unsigned long long flags; int pkg_cstate_limit; }; @@ -1384,7 +1625,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c1-residency", .msr = MSR_CORE_C1_RES, .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, - .bic = BIC_CPU_c1, + .bic_number = BIC_CPU_c1, .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, .pkg_cstate_limit = 0, }, @@ -1394,7 +1635,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c3-residency", .msr = MSR_CORE_C3_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, - .bic = BIC_CPU_c3, + .bic_number = BIC_CPU_c3, .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, .pkg_cstate_limit = 0, }, @@ -1404,7 +1645,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c6-residency", .msr = MSR_CORE_C6_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, - .bic = BIC_CPU_c6, + .bic_number = BIC_CPU_c6, .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE 
| CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, .pkg_cstate_limit = 0, }, @@ -1414,7 +1655,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c7-residency", .msr = MSR_CORE_C7_RESIDENCY, .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, - .bic = BIC_CPU_c7, + .bic_number = BIC_CPU_c7, .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, .pkg_cstate_limit = 0, }, @@ -1424,7 +1665,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c2-residency", .msr = MSR_PKG_C2_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, - .bic = BIC_Pkgpc2, + .bic_number = BIC_Pkgpc2, .flags = 0, .pkg_cstate_limit = PCL__2, }, @@ -1434,7 +1675,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c3-residency", .msr = MSR_PKG_C3_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, - .bic = BIC_Pkgpc3, + .bic_number = BIC_Pkgpc3, .flags = 0, .pkg_cstate_limit = PCL__3, }, @@ -1444,7 +1685,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c6-residency", .msr = MSR_PKG_C6_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, - .bic = BIC_Pkgpc6, + .bic_number = BIC_Pkgpc6, .flags = 0, .pkg_cstate_limit = PCL__6, }, @@ -1454,7 +1695,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c7-residency", .msr = MSR_PKG_C7_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, - .bic = BIC_Pkgpc7, + .bic_number = BIC_Pkgpc7, .flags = 0, .pkg_cstate_limit = PCL__7, }, @@ -1464,7 +1705,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c8-residency", .msr = MSR_PKG_C8_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, - .bic = BIC_Pkgpc8, + .bic_number = BIC_Pkgpc8, .flags = 0, .pkg_cstate_limit = PCL__8, }, @@ -1474,7 +1715,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c9-residency", 
.msr = MSR_PKG_C9_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, - .bic = BIC_Pkgpc9, + .bic_number = BIC_Pkgpc9, .flags = 0, .pkg_cstate_limit = PCL__9, }, @@ -1484,7 +1725,7 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { .perf_name = "c10-residency", .msr = MSR_PKG_C10_RESIDENCY, .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, - .bic = BIC_Pkgpc10, + .bic_number = BIC_Pkgpc10, .flags = 0, .pkg_cstate_limit = PCL_10, }, @@ -1840,8 +2081,6 @@ struct pkg_data { ((node_no) * topo.cores_per_node) + \ (core_no)) -#define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no) - /* * The accumulated sum of MSR is defined as a monotonic * increasing MSR, it will be accumulated periodically, @@ -2036,6 +2275,7 @@ struct platform_counters { struct cpu_topology { int physical_package_id; int die_id; + int l3_id; int logical_cpu_id; int physical_node_id; int logical_node_id; /* 0-based count within the package */ @@ -2057,6 +2297,7 @@ struct topo_params { int max_core_id; int max_package_id; int max_die_id; + int max_l3_id; int max_node_num; int nodes_per_pkg; int cores_per_node; @@ -2090,6 +2331,8 @@ int cpu_is_not_allowed(int cpu) * skip non-present cpus */ +#define PER_THREAD_PARAMS struct thread_data *t, struct core_data *c, struct pkg_data *p + int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *), struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base) { @@ -2103,16 +2346,15 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { struct thread_data *t; struct core_data *c; - struct pkg_data *p; + t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_allowed(t->cpu_id)) continue; c = GET_CORE(core_base, core_no, node_no, pkg_no); - p = GET_PKG(pkg_base, pkg_no); - retval |= func(t, c, p); + retval |= func(t, c, &pkg_base[pkg_no]); } } } @@ -2120,21 
+2362,21 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk return retval; } -int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int is_cpu_first_thread_in_core(PER_THREAD_PARAMS) { UNUSED(p); return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0); } -int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int is_cpu_first_core_in_package(PER_THREAD_PARAMS) { UNUSED(c); return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0); } -int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int is_cpu_first_thread_in_package(PER_THREAD_PARAMS) { return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p); } @@ -2179,10 +2421,13 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp | - BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; - - bic_enabled &= ~bic_msrs; + CLR_BIC(BIC_Mod_c6, &bic_enabled); + CLR_BIC(BIC_CoreTmp, &bic_enabled); + CLR_BIC(BIC_Totl_c0, &bic_enabled); + CLR_BIC(BIC_Any_c0, &bic_enabled); + CLR_BIC(BIC_GFX_c0, &bic_enabled); + CLR_BIC(BIC_CPUGFX, &bic_enabled); + CLR_BIC(BIC_PkgTmp, &bic_enabled); free_sys_msr_counters(); } @@ -2310,6 +2555,8 @@ char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; int deferred_add_index; int deferred_skip_index; +unsigned int deferred_add_consumed; +unsigned int deferred_skip_consumed; /* * HIDE_LIST - hide this list of counters, show the rest [default] @@ -2380,10 +2627,9 @@ void help(void) * for all the strings in comma separate name_list, * set the approprate bit in return value. 
*/ -unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) +void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode) { unsigned int i; - unsigned long long retval = 0; while (name_list) { char *comma; @@ -2395,41 +2641,39 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) for (i = 0; i < MAX_BIC; ++i) { if (!strcmp(name_list, bic[i].name)) { - retval |= (1ULL << i); + SET_BIC(i, ret_set); break; } if (!strcmp(name_list, "all")) { - retval |= ~0; + bic_set_all(ret_set); break; } else if (!strcmp(name_list, "topology")) { - retval |= BIC_GROUP_TOPOLOGY; + CPU_OR(ret_set, ret_set, &bic_group_topology); break; } else if (!strcmp(name_list, "power")) { - retval |= BIC_GROUP_THERMAL_PWR; + CPU_OR(ret_set, ret_set, &bic_group_thermal_pwr); break; } else if (!strcmp(name_list, "idle")) { - retval |= BIC_GROUP_IDLE; + CPU_OR(ret_set, ret_set, &bic_group_idle); break; } else if (!strcmp(name_list, "swidle")) { - retval |= BIC_GROUP_SW_IDLE; + CPU_OR(ret_set, ret_set, &bic_group_sw_idle); break; } else if (!strcmp(name_list, "sysfs")) { /* legacy compatibility */ - retval |= BIC_GROUP_SW_IDLE; + CPU_OR(ret_set, ret_set, &bic_group_sw_idle); break; } else if (!strcmp(name_list, "hwidle")) { - retval |= BIC_GROUP_HW_IDLE; + CPU_OR(ret_set, ret_set, &bic_group_hw_idle); break; } else if (!strcmp(name_list, "frequency")) { - retval |= BIC_GROUP_FREQUENCY; + CPU_OR(ret_set, ret_set, &bic_group_frequency); break; } else if (!strcmp(name_list, "other")) { - retval |= BIC_OTHER; + CPU_OR(ret_set, ret_set, &bic_group_other); break; } - } if (i == MAX_BIC) { - fprintf(stderr, "deferred %s\n", name_list); if (mode == SHOW_LIST) { deferred_add_names[deferred_add_index++] = name_list; if (deferred_add_index >= MAX_DEFERRED) { @@ -2456,7 +2700,6 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) name_list++; } - return retval; } void print_header(char *delim) @@ -2474,6 +2717,8 @@ void 
print_header(char *delim) outp += sprintf(outp, "%sPackage", (printed++ ? delim : "")); if (DO_BIC(BIC_Die)) outp += sprintf(outp, "%sDie", (printed++ ? delim : "")); + if (DO_BIC(BIC_L3)) + outp += sprintf(outp, "%sL3", (printed++ ? delim : "")); if (DO_BIC(BIC_Node)) outp += sprintf(outp, "%sNode", (printed++ ? delim : "")); if (DO_BIC(BIC_Core)) @@ -2514,7 +2759,7 @@ void print_header(char *delim) for (mp = sys.tp; mp; mp = mp->next) { - if (mp->format == FORMAT_RAW) { + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name); else @@ -2589,7 +2834,7 @@ void print_header(char *delim) } for (mp = sys.cp; mp; mp = mp->next) { - if (mp->format == FORMAT_RAW) { + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); else @@ -2719,7 +2964,7 @@ void print_header(char *delim) outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : "")); for (mp = sys.pp; mp; mp = mp->next) { - if (mp->format == FORMAT_RAW) { + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); else if (mp->width == 32) @@ -2777,7 +3022,7 @@ void print_header(char *delim) outp += sprintf(outp, "\n"); } -int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int dump_counters(PER_THREAD_PARAMS) { int i; struct msr_counter *mp; @@ -2892,7 +3137,7 @@ double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desir /* * column formatting convention & formats */ -int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int format_counters(PER_THREAD_PARAMS) { static int count; @@ -2945,6 +3190,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s-", (printed++ ? 
delim : "")); if (DO_BIC(BIC_Die)) outp += sprintf(outp, "%s-", (printed++ ? delim : "")); + if (DO_BIC(BIC_L3)) + outp += sprintf(outp, "%s-", (printed++ ? delim : "")); if (DO_BIC(BIC_Node)) outp += sprintf(outp, "%s-", (printed++ ? delim : "")); if (DO_BIC(BIC_Core)) @@ -2968,6 +3215,12 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data else outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } + if (DO_BIC(BIC_L3)) { + if (c) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].l3_id); + else + outp += sprintf(outp, "%s-", (printed++ ? delim : "")); + } if (DO_BIC(BIC_Node)) { if (t) outp += sprintf(outp, "%s%d", @@ -3032,7 +3285,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data /* Added counters */ for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) { + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { if (mp->width == 32) outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]); @@ -3129,7 +3382,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) { + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { if (mp->width == 32) outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]); @@ -3328,7 +3581,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) { + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { if (mp->width == 32) outp += sprintf(outp, "%s0x%08x", (printed++ ? 
delim : ""), (unsigned int)p->counter[i]); @@ -3426,7 +3679,7 @@ void flush_output_stderr(void) outp = output_buffer; } -void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) +void format_all_counters(PER_THREAD_PARAMS) { static int count; @@ -3505,7 +3758,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; else if (mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; @@ -3549,7 +3802,7 @@ void delta_core(struct core_data *new, struct core_data *old) DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; @@ -3663,7 +3916,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->smi_count = new->smi_count - old->smi_count; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) + if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; @@ -3717,7 +3970,7 @@ void rapl_counter_clear(struct rapl_counter *c) c->unit = RAPL_UNIT_INVALID; } -void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) +void clear_counters(PER_THREAD_PARAMS) { int i; struct msr_counter *mp; @@ -3814,7 +4067,7 @@ void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter dst->raw_value += src->raw_value; } -int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int sum_counters(PER_THREAD_PARAMS) { 
int i; struct msr_counter *mp; @@ -3962,7 +4215,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) * sum the counters for all cpus in the system * compute the weighted average */ -void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p) +void compute_average(PER_THREAD_PARAMS) { int i; struct msr_counter *mp; @@ -4545,7 +4798,7 @@ char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) return NULL; } -int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p) +int get_cstate_counters(unsigned int cpu, PER_THREAD_PARAMS) { /* * Overcommit memory a little bit here, @@ -4845,7 +5098,7 @@ static inline int get_rapl_domain_id(int cpu) * migrate to cpu * acquire and record local counters for that cpu */ -int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int get_counters(PER_THREAD_PARAMS) { int cpu = t->cpu_id; unsigned long long msr; @@ -5673,6 +5926,11 @@ int get_die_id(int cpu) return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/die_id", cpu); } +int get_l3_id(int cpu) +{ + return parse_int_file("/sys/devices/system/cpu/cpu%d/cache/index3/id", cpu); +} + int get_core_id(int cpu) { return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); @@ -5861,7 +6119,6 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { struct thread_data *t, *t2; struct core_data *c, *c2; - struct pkg_data *p, *p2; t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); @@ -5873,10 +6130,7 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, c = GET_CORE(core_base, core_no, node_no, pkg_no); c2 = GET_CORE(core_base2, core_no, node_no, pkg_no); - p = GET_PKG(pkg_base, pkg_no); - p2 = GET_PKG(pkg_base2, pkg_no); - - retval |= func(t, c, p, t2, c2, p2); + retval |= func(t, c, &pkg_base[pkg_no], t2, c2, 
&pkg_base2[pkg_no]); } } } @@ -6334,7 +6588,7 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) timer_t timerid; /* Timer callback, update the sum of MSRs periodically. */ -static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p) +static int update_msr_sum(PER_THREAD_PARAMS) { int i, ret; int cpu = t->cpu_id; @@ -6572,8 +6826,16 @@ int check_for_cap_sys_rawio(void) int ret = 0; caps = cap_get_proc(); - if (caps == NULL) + if (caps == NULL) { + /* + * CONFIG_MULTIUSER=n kernels have no cap_get_proc() + * Allow them to continue and attempt to access MSRs + */ + if (errno == ENOSYS) + return 0; + return 1; + } if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) { ret = 1; @@ -6740,7 +7002,8 @@ static void probe_intel_uncore_frequency_legacy(void) sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); - if (access(path_base, R_OK)) + sprintf(path, "%s/current_freq_khz", path_base); + if (access(path, R_OK)) continue; BIC_PRESENT(BIC_UNCORE_MHZ); @@ -7072,7 +7335,7 @@ static void dump_sysfs_pstate_config(void) * print_epb() * Decode the ENERGY_PERF_BIAS MSR */ -int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int print_epb(PER_THREAD_PARAMS) { char *epb_string; int cpu, epb; @@ -7121,7 +7384,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) * print_hwp() * Decode the MSR_HWP_CAPABILITIES */ -int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int print_hwp(PER_THREAD_PARAMS) { unsigned long long msr; int cpu; @@ -7210,7 +7473,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) /* * print_perf_limit() */ -int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int print_perf_limit(PER_THREAD_PARAMS) { unsigned long long msr; int cpu; @@ -7335,21 +7598,28 @@ void rapl_probe_intel(void) unsigned long long msr; 
unsigned int time_unit; double tdp; - const unsigned long long bic_watt_bits = BIC_SysWatt | BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; - const unsigned long long bic_joules_bits = BIC_Sys_J | BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; - if (rapl_joules) - bic_enabled &= ~bic_watt_bits; - else - bic_enabled &= ~bic_joules_bits; + if (rapl_joules) { + CLR_BIC(BIC_SysWatt, &bic_enabled); + CLR_BIC(BIC_PkgWatt, &bic_enabled); + CLR_BIC(BIC_CorWatt, &bic_enabled); + CLR_BIC(BIC_RAMWatt, &bic_enabled); + CLR_BIC(BIC_GFXWatt, &bic_enabled); + } else { + CLR_BIC(BIC_Sys_J, &bic_enabled); + CLR_BIC(BIC_Pkg_J, &bic_enabled); + CLR_BIC(BIC_Cor_J, &bic_enabled); + CLR_BIC(BIC_RAM_J, &bic_enabled); + CLR_BIC(BIC_GFX_J, &bic_enabled); + } if (!platform->rapl_msrs || no_msr) return; if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) - bic_enabled &= ~BIC_PKG__; + CLR_BIC(BIC_PKG__, &bic_enabled); if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) - bic_enabled &= ~BIC_RAM__; + CLR_BIC(BIC_RAM__, &bic_enabled); /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) @@ -7388,13 +7658,14 @@ void rapl_probe_amd(void) { unsigned long long msr; double tdp; - const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt; - const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J; - if (rapl_joules) - bic_enabled &= ~bic_watt_bits; - else - bic_enabled &= ~bic_joules_bits; + if (rapl_joules) { + CLR_BIC(BIC_PkgWatt, &bic_enabled); + CLR_BIC(BIC_CorWatt, &bic_enabled); + } else { + CLR_BIC(BIC_Pkg_J, &bic_enabled); + CLR_BIC(BIC_Cor_J, &bic_enabled); + } if (!platform->rapl_msrs || no_msr) return; @@ -7577,7 +7848,7 @@ static int print_rapl_sysfs(void) return 0; } -int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int print_rapl(PER_THREAD_PARAMS) { unsigned long long msr; const char *msr_name; @@ -7731,7 +8002,7 @@ void probe_rapl(void) * below this value, including the 
Digital Thermal Sensor (DTS), * Package Thermal Management Sensor (PTM), and thermal event thresholds. */ -int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int set_temperature_target(PER_THREAD_PARAMS) { unsigned long long msr; unsigned int tcc_default, tcc_offset; @@ -7799,7 +8070,7 @@ guess: return 0; } -int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int print_thermal(PER_THREAD_PARAMS) { unsigned long long msr; unsigned int dts, dts2; @@ -7879,7 +8150,7 @@ void probe_thermal(void) for_all_cpus(print_thermal, ODD_COUNTERS); } -int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int get_cpu_type(PER_THREAD_PARAMS) { unsigned int eax, ebx, ecx, edx; @@ -8141,7 +8412,7 @@ void rapl_perf_init(void) enum rapl_unit unit; unsigned int next_domain; - if (!BIC_IS_ENABLED(cai->bic)) + if (!BIC_IS_ENABLED(cai->bic_number)) continue; memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); @@ -8205,7 +8476,7 @@ void rapl_perf_init(void) /* If any CPU has access to the counter, make it present */ if (has_counter) - BIC_PRESENT(cai->bic); + BIC_PRESENT(cai->bic_number); } free(domain_visited); @@ -8426,7 +8697,7 @@ void cstate_perf_init_(bool soft_c1) if (!per_core && pkg_visited[pkg_id]) continue; - const bool counter_needed = BIC_IS_ENABLED(cai->bic) || + const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); const bool counter_supported = (platform->supported_cstates & cai->feature_mask); @@ -8453,7 +8724,7 @@ void cstate_perf_init_(bool soft_c1) /* If any CPU has access to the counter, make it present */ if (has_counter) - BIC_PRESENT(cai->bic); + BIC_PRESENT(cai->bic_number); } free(cores_visited); @@ -8949,6 +9220,11 @@ void topology_probe(bool startup) if (cpus[i].die_id > topo.max_die_id) topo.max_die_id = cpus[i].die_id; + /* get l3 information */ + cpus[i].l3_id = 
get_l3_id(i); + if (cpus[i].l3_id > topo.max_l3_id) + topo.max_l3_id = cpus[i].l3_id; + /* get numa node information */ cpus[i].physical_node_id = get_physical_node_id(&cpus[i]); if (cpus[i].physical_node_id > topo.max_node_num) @@ -8981,6 +9257,9 @@ void topology_probe(bool startup) if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); + if (!summary_only && topo.max_l3_id > 0) + BIC_PRESENT(BIC_L3); + topo.num_packages = max_package_id + 1; if (debug > 1) fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages); @@ -9004,8 +9283,8 @@ void topology_probe(bool startup) if (cpu_is_not_present(i)) continue; fprintf(outf, - "cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n", - i, cpus[i].physical_package_id, cpus[i].die_id, + "cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n", + i, cpus[i].physical_package_id, cpus[i].die_id, cpus[i].l3_id, cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id); } @@ -9060,7 +9339,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, int thread_id = cpus[cpu_id].thread_id; struct thread_data *t; struct core_data *c; - struct pkg_data *p; /* Workaround for systems where physical_node_id==-1 * and logical_node_id==(-1 - topo.num_cpus) @@ -9070,18 +9348,17 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id); c = GET_CORE(core_base, core_id, node_id, pkg_id); - p = GET_PKG(pkg_base, pkg_id); t->cpu_id = cpu_id; if (!cpu_is_not_allowed(cpu_id)) { if (c->base_cpu < 0) c->base_cpu = t->cpu_id; - if (p->base_cpu < 0) - p->base_cpu = t->cpu_id; + if (pkg_base[pkg_id].base_cpu < 0) + pkg_base[pkg_id].base_cpu = t->cpu_id; } c->core_id = core_id; - p->package_id = pkg_id; + pkg_base[pkg_id].package_id = pkg_id; } int initialize_counters(int cpu_id) @@ -9121,7 +9398,7 @@ void allocate_irq_buffers(void) err(-1, 
"calloc %d NMI", topo.max_cpu_num + 1); } -int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int update_topo(PER_THREAD_PARAMS) { topo.allowed_cpus++; if ((int)t->cpu_id == c->base_cpu) @@ -9189,7 +9466,7 @@ void check_msr_access(void) void check_perf_access(void) { if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access()) - bic_enabled &= ~BIC_IPC; + CLR_BIC(BIC_IPC, &bic_enabled); } bool perf_has_hybrid_devices(void) @@ -9758,8 +10035,8 @@ void turbostat_init() * disable more BICs, since it can't be reported accurately. */ if (platform->enable_tsc_tweak && !has_base_hz) { - bic_enabled &= ~BIC_Busy; - bic_enabled &= ~BIC_Bzy_MHz; + CLR_BIC(BIC_Busy, &bic_enabled); + CLR_BIC(BIC_Bzy_MHz, &bic_enabled); } } @@ -9817,6 +10094,7 @@ int fork_it(char **argv) timersub(&tv_odd, &tv_even, &tv_delta); if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) fprintf(outf, "%s: Counter reset detected\n", progname); + delta_platform(&platform_counters_odd, &platform_counters_even); compute_average(EVEN_COUNTERS); format_all_counters(EVEN_COUNTERS); @@ -9848,7 +10126,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.06.08 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2025.09.09 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -10145,6 +10423,10 @@ void parse_add_command_msr(char *add_command) format = FORMAT_RAW; goto next; } + if (!strncmp(add_command, "average", strlen("average"))) { + format = FORMAT_AVERAGE; + goto next; + } if (!strncmp(add_command, "delta", strlen("delta"))) { format = FORMAT_DELTA; goto next; @@ -10417,13 +10699,19 @@ next: has_format = true; } + if (strcmp("average", format_name) == 0) { + format = FORMAT_AVERAGE; + has_format = true; + } + if (strcmp("delta", format_name) == 0) { format = FORMAT_DELTA; has_format = true; } if (!has_format) { - fprintf(stderr, "%s: Invalid format %s. 
Expected raw or delta\n", __func__, format_name); + fprintf(stderr, "%s: Invalid format %s. Expected raw, average or delta\n", + __func__, format_name); exit(1); } } @@ -10513,8 +10801,10 @@ int is_deferred_add(char *name) int i; for (i = 0; i < deferred_add_index; ++i) - if (!strcmp(name, deferred_add_names[i])) + if (!strcmp(name, deferred_add_names[i])) { + deferred_add_consumed |= (1 << i); return 1; + } return 0; } @@ -10523,11 +10813,34 @@ int is_deferred_skip(char *name) int i; for (i = 0; i < deferred_skip_index; ++i) - if (!strcmp(name, deferred_skip_names[i])) + if (!strcmp(name, deferred_skip_names[i])) { + deferred_skip_consumed |= (1 << i); return 1; + } return 0; } +void verify_deferred_consumed(void) +{ + int i; + int fail = 0; + + for (i = 0; i < deferred_add_index; ++i) { + if (!(deferred_add_consumed & (1 << i))) { + warnx("Counter '%s' can not be added.", deferred_add_names[i]); + fail++; + } + } + for (i = 0; i < deferred_skip_index; ++i) { + if (!(deferred_skip_consumed & (1 << i))) { + warnx("Counter '%s' can not be skipped.", deferred_skip_names[i]); + fail++; + } + } + if (fail) + exit(-EINVAL); +} + void probe_cpuidle_residency(void) { char path[64]; @@ -10537,9 +10850,6 @@ void probe_cpuidle_residency(void) int min_state = 1024, max_state = 0; char *sp; - if (!DO_BIC(BIC_pct_idle)) - return; - for (state = 10; state >= 0; --state) { sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -10752,22 +11062,29 @@ void cmdline(int argc, char **argv) no_perf = 1; break; case 'e': - /* --enable specified counter */ - bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST); + /* --enable specified counter, without clearning existing list */ + bic_lookup(&bic_enabled, optarg, SHOW_LIST); break; case 'f': force_load++; break; case 'd': debug++; - ENABLE_BIC(BIC_DISABLED_BY_DEFAULT); + bic_set_all(&bic_enabled); break; case 'H': /* * --hide: do not show those specified * multiple invocations simply clear more bits 
in enabled mask */ - bic_enabled &= ~bic_lookup(optarg, HIDE_LIST); + { + cpu_set_t bic_group_hide; + + BIC_INIT(&bic_group_hide); + + bic_lookup(&bic_group_hide, optarg, HIDE_LIST); + bic_clear_bits(&bic_enabled, &bic_group_hide); + } break; case 'h': default: @@ -10791,7 +11108,7 @@ void cmdline(int argc, char **argv) rapl_joules++; break; case 'l': - ENABLE_BIC(BIC_DISABLED_BY_DEFAULT); + bic_set_all(&bic_enabled); list_header_only++; quiet++; break; @@ -10828,9 +11145,8 @@ void cmdline(int argc, char **argv) * subsequent invocations can add to it. */ if (shown == 0) - bic_enabled = bic_lookup(optarg, SHOW_LIST); - else - bic_enabled |= bic_lookup(optarg, SHOW_LIST); + BIC_INIT(&bic_enabled); + bic_lookup(&bic_enabled, optarg, SHOW_LIST); shown = 1; break; case 'S': @@ -10867,6 +11183,8 @@ int main(int argc, char **argv) { int fd, ret; + bic_groups_init(); + fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY); if (fd < 0) goto skip_cgroup_setting; @@ -10889,6 +11207,8 @@ skip_cgroup_setting: probe_cpuidle_residency(); probe_cpuidle_counts(); + verify_deferred_consumed(); + if (!getuid()) set_rlimit(); diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 530752ddde8e..c1cfd297aabf 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -229,7 +229,8 @@ #if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) || \ + (defined(__TARGET_ARCH_powerpc)) #define CAN_USE_LOAD_ACQ_STORE_REL #endif diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 632ab44737ec..c952640f163b 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -33,6 +33,11 
@@ void print_cachestat(struct cachestat *cs) cs->nr_evicted, cs->nr_recently_evicted); } +enum file_type { + FILE_MMAP, + FILE_SHMEM +}; + bool write_exactly(int fd, size_t filesize) { int random_fd = open("/dev/urandom", O_RDONLY); @@ -201,8 +206,20 @@ out1: out: return ret; } +const char *file_type_str(enum file_type type) +{ + switch (type) { + case FILE_SHMEM: + return "shmem"; + case FILE_MMAP: + return "mmap"; + default: + return "unknown"; + } +} -bool test_cachestat_shmem(void) + +bool run_cachestat_test(enum file_type type) { size_t PS = sysconf(_SC_PAGESIZE); size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ @@ -212,27 +229,50 @@ bool test_cachestat_shmem(void) char *filename = "tmpshmcstat"; struct cachestat cs; bool ret = true; + int fd; unsigned long num_pages = compute_len / PS; - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + if (type == FILE_SHMEM) + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + else + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd < 0) { - ksft_print_msg("Unable to create shmem file.\n"); + ksft_print_msg("Unable to create %s file.\n", + file_type_str(type)); ret = false; goto out; } if (ftruncate(fd, filesize)) { - ksft_print_msg("Unable to truncate shmem file.\n"); + ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type)); ret = false; goto close_fd; } + switch (type) { + case FILE_SHMEM: + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to write to file.\n"); + ret = false; + goto close_fd; + } + break; + case FILE_MMAP: + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); - if (!write_exactly(fd, filesize)) { - ksft_print_msg("Unable to write to shmem file.\n"); + if (map == MAP_FAILED) { + ksft_print_msg("mmap failed.\n"); + ret = false; + goto close_fd; + } + for (int i = 0; i < filesize; i++) + map[i] = 'A'; + break; + default: + ksft_print_msg("Unsupported file type.\n"); ret = false; goto close_fd; } - syscall_ret = 
syscall(__NR_cachestat, fd, &cs_range, &cs, 0); if (syscall_ret) { @@ -308,12 +348,18 @@ int main(void) break; } - if (test_cachestat_shmem()) + if (run_cachestat_test(FILE_SHMEM)) ksft_test_result_pass("cachestat works with a shmem file\n"); else { ksft_test_result_fail("cachestat fails with a shmem file\n"); ret = 1; } + if (run_cachestat_test(FILE_MMAP)) + ksft_test_result_pass("cachestat works with a mmap file\n"); + else { + ksft_test_result_fail("cachestat fails with a mmap file\n"); + ret = 1; + } return ret; } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 3556f3563e08..984ece05f7f9 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -11,6 +11,7 @@ TEST_GEN_FILES := \ TEST_PROGS := \ napi_id.py \ + napi_threaded.py \ netcons_basic.sh \ netcons_cmdline.sh \ netcons_fragmented_msg.sh \ diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py new file mode 100755 index 000000000000..b2698db39817 --- /dev/null +++ b/tools/testing/selftests/drivers/net/napi_threaded.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Test napi threaded states. 
+""" + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_ne, ksft_ge +from lib.py import NetDrvEnv, NetdevFamily +from lib.py import cmd, defer, ethtool + + +def _assert_napi_threaded_enabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'enabled') + ksft_ne(napi.get('pid'), None) + + +def _assert_napi_threaded_disabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'disabled') + ksft_eq(napi.get('pid'), None) + + +def _set_threaded_state(cfg, threaded) -> None: + cmd(f"echo {threaded} > /sys/class/net/{cfg.ifname}/threaded") + + +def _setup_deferred_cleanup(cfg) -> None: + combined = ethtool(f"-l {cfg.ifname}", json=True)[0].get("combined", 0) + ksft_ge(combined, 2) + defer(ethtool, f"-L {cfg.ifname} combined {combined}") + + threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout + defer(_set_threaded_state, cfg, threaded) + + +def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level and + then disabled at napi level for one napi, the threaded state + of all napis is preserved after a change in number of queues. 
+ """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + # disable threaded for napi1 + nl.napi_set({'id': napi1_id, 'threaded': 'disabled'}) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_disabled(nl, napi1_id) + + +def change_num_queues(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level, + the napi threaded state is preserved after a change in + number of queues. + """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEnv(__file__, queue_count=2) as cfg: + ksft_run([change_num_queues, + enable_dev_threaded_disable_napi_threaded], + args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh index e8e0dc088d6a..01d0c044a5fc 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh +++ 
b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh @@ -1053,6 +1053,6 @@ trap cleanup EXIT setup_prepare -tests_run +xfail_on_slow tests_run exit $EXIT_STATUS diff --git a/tools/testing/selftests/kho/arm64.conf b/tools/testing/selftests/kho/arm64.conf new file mode 100644 index 000000000000..ee696807cd35 --- /dev/null +++ b/tools/testing/selftests/kho/arm64.conf @@ -0,0 +1,9 @@ +QEMU_CMD="qemu-system-aarch64 -M virt -cpu max" +QEMU_KCONFIG=" +CONFIG_SERIAL_AMBA_PL010=y +CONFIG_SERIAL_AMBA_PL010_CONSOLE=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +" +KERNEL_IMAGE="Image" +KERNEL_CMDLINE="console=ttyAMA0" diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c new file mode 100644 index 000000000000..8034e24c6bf6 --- /dev/null +++ b/tools/testing/selftests/kho/init.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef NOLIBC +#include <errno.h> +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <syscall.h> +#include <sys/mount.h> +#include <sys/reboot.h> +#endif + +/* from arch/x86/include/asm/setup.h */ +#define COMMAND_LINE_SIZE 2048 + +/* from include/linux/kexec.h */ +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + +#define KHO_FINILIZE "/debugfs/kho/out/finalize" +#define KERNEL_IMAGE "/kernel" + +static int mount_filesystems(void) +{ + if (mount("debugfs", "/debugfs", "debugfs", 0, NULL) < 0) + return -1; + + return mount("proc", "/proc", "proc", 0, NULL); +} + +static int kho_enable(void) +{ + const char enable[] = "1"; + int fd; + + fd = open(KHO_FINILIZE, O_RDWR); + if (fd < 0) + return -1; + + if (write(fd, enable, sizeof(enable)) != sizeof(enable)) + return 1; + + close(fd); + return 0; +} + +static long kexec_file_load(int kernel_fd, int initrd_fd, + unsigned long cmdline_len, const char *cmdline, + unsigned long flags) +{ + return syscall(__NR_kexec_file_load, kernel_fd, initrd_fd, cmdline_len, + cmdline, flags); +} + +static int kexec_load(void) +{ + char 
cmdline[COMMAND_LINE_SIZE]; + ssize_t len; + int fd, err; + + fd = open("/proc/cmdline", O_RDONLY); + if (fd < 0) + return -1; + + len = read(fd, cmdline, sizeof(cmdline)); + close(fd); + if (len <= 0) + return -1; + + /* replace \n with \0 */ + cmdline[len - 1] = 0; + fd = open(KERNEL_IMAGE, O_RDONLY); + if (fd < 0) + return -1; + + err = kexec_file_load(fd, -1, len, cmdline, KEXEC_FILE_NO_INITRAMFS); + close(fd); + + return err ? : 0; +} + +int main(int argc, char *argv[]) +{ + if (mount_filesystems()) + goto err_reboot; + + if (kho_enable()) + goto err_reboot; + + if (kexec_load()) + goto err_reboot; + + if (reboot(RB_KEXEC)) + goto err_reboot; + + return 0; + +err_reboot: + reboot(RB_AUTOBOOT); + return -1; +} diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh new file mode 100755 index 000000000000..ec70a17bd476 --- /dev/null +++ b/tools/testing/selftests/kho/vmtest.sh @@ -0,0 +1,183 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -ue + +CROSS_COMPILE="${CROSS_COMPILE:-""}" + +test_dir=$(realpath "$(dirname "$0")") +kernel_dir=$(realpath "$test_dir/../../../..") + +tmp_dir=$(mktemp -d /tmp/kho-test.XXXXXXXX) +headers_dir="$tmp_dir/usr" +initrd_dir="$tmp_dir/initrd" +initrd="$tmp_dir/initrd.cpio" + +source "$test_dir/../kselftest/ktap_helpers.sh" + +function usage() { + cat <<EOF +$0 [-d build_dir] [-j jobs] [-t target_arch] [-h] +Options: + -d) path to the kernel build directory + -j) number of jobs for compilation, similar to -j in make + -t) run test for target_arch, requires CROSS_COMPILE set + supported targets: aarch64, x86_64 + -h) display this help +EOF +} + +function cleanup() { + rm -fr "$tmp_dir" + ktap_finished +} +trap cleanup EXIT + +function skip() { + local msg=${1:-""} + + ktap_test_skip "$msg" + exit "$KSFT_SKIP" +} + +function fail() { + local msg=${1:-""} + + ktap_test_fail "$msg" + exit "$KSFT_FAIL" +} + +function build_kernel() { + local build_dir=$1 + local make_cmd=$2 + local 
arch_kconfig=$3 + local kimage=$4 + + local kho_config="$tmp_dir/kho.config" + local kconfig="$build_dir/.config" + + # enable initrd, KHO and KHO test in kernel configuration + tee "$kconfig" > "$kho_config" <<EOF +CONFIG_BLK_DEV_INITRD=y +CONFIG_KEXEC_HANDOVER=y +CONFIG_TEST_KEXEC_HANDOVER=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_VM=y +$arch_kconfig +EOF + + make_cmd="$make_cmd -C $kernel_dir O=$build_dir" + $make_cmd olddefconfig + + # verify that kernel configuration has all necessary options + while read -r opt ; do + grep "$opt" "$kconfig" &>/dev/null || skip "$opt is missing" + done < "$kho_config" + + $make_cmd "$kimage" + $make_cmd headers_install INSTALL_HDR_PATH="$headers_dir" +} + +function mkinitrd() { + local kernel=$1 + + mkdir -p "$initrd_dir"/{dev,debugfs,proc} + sudo mknod "$initrd_dir/dev/console" c 5 1 + + "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -I"$headers_dir/include" \ + -fno-asynchronous-unwind-tables -fno-ident -nostdlib \ + -include "$test_dir/../../../include/nolibc/nolibc.h" \ + -o "$initrd_dir/init" "$test_dir/init.c" \ + + cp "$kernel" "$initrd_dir/kernel" + + pushd "$initrd_dir" &>/dev/null + find . 
| cpio -H newc --create > "$initrd" 2>/dev/null + popd &>/dev/null +} + +function run_qemu() { + local qemu_cmd=$1 + local cmdline=$2 + local kernel=$3 + local serial="$tmp_dir/qemu.serial" + + cmdline="$cmdline kho=on panic=-1" + + $qemu_cmd -m 1G -smp 2 -no-reboot -nographic -nodefaults \ + -accel kvm -accel hvf -accel tcg \ + -serial file:"$serial" \ + -append "$cmdline" \ + -kernel "$kernel" \ + -initrd "$initrd" + + grep "KHO restore succeeded" "$serial" &> /dev/null || fail "KHO failed" +} + +function target_to_arch() { + local target=$1 + + case $target in + aarch64) echo "arm64" ;; + x86_64) echo "x86" ;; + *) skip "architecture $target is not supported" + esac +} + +function main() { + local build_dir="$kernel_dir/.kho" + local jobs=$(($(nproc) * 2)) + local target="$(uname -m)" + + # skip the test if any of the preparation steps fails + set -o errtrace + trap skip ERR + + while getopts 'hd:j:t:' opt; do + case $opt in + d) + build_dir="$OPTARG" + ;; + j) + jobs="$OPTARG" + ;; + t) + target="$OPTARG" + ;; + h) + usage + exit 0 + ;; + *) + echo Unknown argument "$opt" + usage + exit 1 + ;; + esac + done + + ktap_print_header + ktap_set_plan 1 + + if [[ "$target" != "$(uname -m)" ]] && [[ -z "$CROSS_COMPILE" ]]; then + skip "Cross-platform testing needs to specify CROSS_COMPILE" + fi + + mkdir -p "$build_dir" + local arch=$(target_to_arch "$target") + source "$test_dir/$arch.conf" + + # build the kernel and create initrd + # initrd includes the kernel image that will be kexec'ed + local make_cmd="make ARCH=$arch CROSS_COMPILE=$CROSS_COMPILE -j$jobs" + build_kernel "$build_dir" "$make_cmd" "$QEMU_KCONFIG" "$KERNEL_IMAGE" + + local kernel="$build_dir/arch/$arch/boot/$KERNEL_IMAGE" + mkinitrd "$kernel" + + run_qemu "$QEMU_CMD" "$KERNEL_CMDLINE" "$kernel" + + ktap_test_pass "KHO succeeded" +} + +main "$@" diff --git a/tools/testing/selftests/kho/x86.conf b/tools/testing/selftests/kho/x86.conf new file mode 100644 index 000000000000..b419e610ca22 --- /dev/null 
+++ b/tools/testing/selftests/kho/x86.conf
@@ -0,0 +1,7 @@
+QEMU_CMD=qemu-system-x86_64
+QEMU_KCONFIG="
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+"
+KERNEL_IMAGE="bzImage"
+KERNEL_CMDLINE="console=ttyS0"
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index f2dafa0b700b..e7b23a8a05fe 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,6 +21,7 @@ on-fault-limit
 transhuge-stress
 pagemap_ioctl
 pfnmap
+process_madv
 *.tmp*
 protection_keys
 protection_keys_32
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += pagemap_ioctl
 TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..471cae8427f1
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+/*
+ * Per-test state.
+ *
+ * page_size:    value of sysconf(_SC_PAGESIZE), cached in setup.
+ * child_pid:    pid of a forked child, or -1 when no child needs cleanup.
+ * remote_pidfd: pidfd referring to the child, or -1 when not open.
+ * pidfd:        pidfd used by the tests that target the calling process.
+ */
+FIXTURE(process_madvise)
+{
+	unsigned long page_size;
+	pid_t child_pid;
+	int remote_pidfd;
+	int pidfd;
+};
+
+/* Start every test with no child and no remote pidfd recorded. */
+FIXTURE_SETUP(process_madvise)
+{
+	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+	self->pidfd = PIDFD_SELF;
+	self->remote_pidfd = -1;
+	self->child_pid = -1;
+}
+
+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{
+	/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
+	if (self->child_pid > 0) {
+		/* Kill and reap whatever child the test left recorded. */
+		kill(self->child_pid, SIGKILL);
+		waitpid(self->child_pid, NULL, 0);
+	}
+
+	if (self->remote_pidfd >= 0)
+		close(self->remote_pidfd);
+}
+
+/*
+ * Thin wrapper that invokes the process_madvise() system call directly.
+ * Returns whatever syscall() returns; on failure that is -1 with errno set.
+ */
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+				   size_t vlen, int advice, unsigned int flags)
+{
+	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+}
+
+/*
+ * This test uses PIDFD_SELF to target the current process. The main
+ * goal is to verify the basic behavior of process_madvise() with
+ * a vector of non-contiguous memory ranges, not its cross-process
+ * capabilities.
+ */
+TEST_F(process_madvise, basic)
+{
+	const unsigned long pagesize = self->page_size;
+	const int madvise_pages = 4;
+	struct iovec vec[madvise_pages];
+	int pidfd = self->pidfd;
+	ssize_t ret;
+	char *map;
+
+	/*
+	 * Create a single large mapping. We will pick pages from this
+	 * mapping to advise on. This ensures we test non-contiguous iovecs.
+	 */
+	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (map == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+
+	/* Fill the entire region with a known pattern. */
+	memset(map, 'A', pagesize * 10);
+
+	/*
+	 * Setup the iovec to point to 4 non-contiguous pages
+	 * within the mapping.
+	 */
+	vec[0].iov_base = &map[0 * pagesize];
+	vec[0].iov_len = pagesize;
+	vec[1].iov_base = &map[3 * pagesize];
+	vec[1].iov_len = pagesize;
+	vec[2].iov_base = &map[5 * pagesize];
+	vec[2].iov_len = pagesize;
+	vec[3].iov_base = &map[8 * pagesize];
+	vec[3].iov_len = pagesize;
+
+	ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
+	/*
+	 * errno is only meaningful when the call failed; a stale value left
+	 * by an earlier call must not turn a successful run into a SKIP.
+	 */
+	if (ret == -1) {
+		if (errno == EPERM)
+			SKIP(return,
+			     "process_madvise() unsupported or permission denied, try running as root.\n");
+		if (errno == EINVAL)
+			SKIP(return,
+			     "process_madvise() unsupported or parameter invalid, please check arguments.\n");
+	}
+
+	/* The call should succeed and report the total bytes processed. */
+	ASSERT_EQ(ret, madvise_pages * pagesize);
+
+	/* Check that advised pages are now zero. */
+	for (int i = 0; i < madvise_pages; i++) {
+		char *advised_page = (char *)vec[i].iov_base;
+
+		/* Content must be 0, not 'A'. */
+		ASSERT_EQ(*advised_page, '\0');
+	}
+
+	/* Check that an un-advised page in between is still 'A'. */
+	char *unadvised_page = &map[1 * pagesize];
+
+	/* Index with the same unsigned type as pagesize. */
+	for (unsigned long i = 0; i < pagesize; i++)
+		ASSERT_EQ(unadvised_page[i], 'A');
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize * 10), 0);
+}
+
+/*
+ * This test deterministically validates process_madvise() with MADV_COLLAPSE
+ * on a remote process; other advice values are difficult to verify reliably.
+ *
+ * The test targets a memory region in a child process and focuses on the
+ * remote process_madvise() result: only addresses and lengths are checked.
+ * The correctness of MADV_COLLAPSE itself is covered by the khugepaged tests.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+	const unsigned long pagesize = self->page_size;
+	long huge_page_size;
+	int pipe_info[2];
+	ssize_t ret;
+	struct iovec vec;
+
+	/* Message the child sends to the parent over the pipe. */
+	struct child_info {
+		pid_t pid;
+		void *map_addr;
+	} info;
+
+	huge_page_size = read_pmd_pagesize();
+	if (huge_page_size <= 0)
+		SKIP(return, "Could not determine a valid huge page size.\n");
+
+	ASSERT_EQ(pipe(pipe_info), 0);
+
+	self->child_pid = fork();
+	ASSERT_NE(self->child_pid, -1);
+
+	if (self->child_pid == 0) {
+		char *map;
+		size_t map_size = 2 * huge_page_size;
+
+		close(pipe_info[0]);
+
+		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		ASSERT_NE(map, MAP_FAILED);
+
+		/* Fault in as small pages */
+		for (size_t i = 0; i < map_size; i += pagesize)
+			map[i] = 'A';
+
+		/* Send info and pause */
+		info.pid = getpid();
+		info.map_addr = map;
+		ret = write(pipe_info[1], &info, sizeof(info));
+		ASSERT_EQ(ret, sizeof(info));
+		close(pipe_info[1]);
+
+		/* Stay alive until the parent teardown sends SIGKILL. */
+		pause();
+		exit(0);
+	}
+
+	close(pipe_info[1]);
+
+	/* Receive child info */
+	ret = read(pipe_info[0], &info, sizeof(info));
+	close(pipe_info[0]);
+	/*
+	 * Do not reap the child here: it may still be blocked in pause(),
+	 * and the parent teardown already kills and waits for child_pid.
+	 */
+	if (ret <= 0)
+		SKIP(return, "Failed to read child info from pipe.\n");
+	ASSERT_EQ(ret, sizeof(info));
+	self->child_pid = info.pid;
+
+	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+	ASSERT_GE(self->remote_pidfd, 0);
+
+	/* Advise only the first huge-page-sized part of the child mapping. */
+	vec.iov_base = info.map_addr;
+	vec.iov_len = huge_page_size;
+
+	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
+				  0);
+	if (ret == -1) {
+		if (errno == EINVAL)
+			SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
+		else if (errno == EPERM)
+			SKIP(return,
+			     "No process_madvise() permissions, try running as root.\n");
+		/*
+		 * NOTE(review): any other errno ends the test here without
+		 * recording a failure - confirm that this is intended.
+		 */
+		return;
+	}
+
+	ASSERT_EQ(ret, huge_page_size);
+}
+
+/*
+ * Test process_madvise() with a pidfd for a process that has already
+ * exited to ensure correct error handling.
+ */
+TEST_F(process_madvise, exited_process_pidfd)
+{
+	const unsigned long pagesize = self->page_size;
+	struct iovec vec;
+	char *map;
+	ssize_t ret;
+
+	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+		   0);
+	if (map == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+
+	vec.iov_base = map;
+	vec.iov_len = pagesize;
+
+	/*
+	 * Using a pidfd for a process that has already exited should fail
+	 * with ESRCH.
+	 */
+	self->child_pid = fork();
+	ASSERT_NE(self->child_pid, -1);
+
+	if (self->child_pid == 0)
+		exit(0);
+
+	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+	ASSERT_GE(self->remote_pidfd, 0);
+
+	/* Wait for the child to ensure it has terminated. */
+	waitpid(self->child_pid, NULL, 0);
+	/*
+	 * The child is reaped, so its pid may be reused by an unrelated
+	 * process. Forget it so that the teardown does not SIGKILL it.
+	 */
+	self->child_pid = -1;
+
+	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
+				  0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ESRCH);
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+/*
+ * Test process_madvise() with bad pidfds to ensure correct error
+ * handling.
+ */
+TEST_F(process_madvise, bad_pidfd)
+{
+	const unsigned long pagesize = self->page_size;
+	struct iovec vec;
+	char *map;
+	ssize_t ret;
+
+	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+		   0);
+	if (map == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+
+	vec.iov_base = map;
+	vec.iov_len = pagesize;
+
+	/* Using an invalid fd number (-1) should fail with EBADF. */
+	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EBADF);
+
+	/*
+	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
+	 * with EBADF.
+	 */
+	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EBADF);
+
+	/* Cleanup. */
+	ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+/*
+ * Test that process_madvise() rejects vlen > UIO_MAXIOV.
+ * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
+ */
+TEST_F(process_madvise, invalid_vlen)
+{
+	const unsigned long pagesize = self->page_size;
+	struct iovec vec;
+	ssize_t ret;
+	char *map = mmap(NULL, pagesize, PROT_READ,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (map == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+
+	vec = (struct iovec){ .iov_base = map, .iov_len = pagesize };
+
+	/* One valid iovec, but a claimed count of 1025 entries. */
+	ret = sys_process_madvise(self->pidfd, &vec, 1025, MADV_DONTNEED, 0);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Release the scratch page. */
+	ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+/*
+ * Exercise process_madvise() with a flags value that is not valid. At the
+ * moment 0 is the only supported value, so this test mainly holds the place
+ * for future additions such as synchronous flags.
+ */
+TEST_F(process_madvise, flag)
+{
+	const unsigned long pagesize = self->page_size;
+	const unsigned int invalid_flag = 0x80000000;
+	struct iovec vec;
+	ssize_t ret;
+	char *map = mmap(NULL, pagesize, PROT_READ,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (map == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+
+	vec = (struct iovec){ .iov_base = map, .iov_len = pagesize };
+
+	ret = sys_process_madvise(self->pidfd, &vec, 1, MADV_DONTNEED,
+				  invalid_flag);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Release the scratch page. */
+	ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a38c984103ce..471e539d82b8 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
 	test pagemap_scan IOCTL
 - pfnmap
 	tests for VM_PFNMAP handling
+- process_madv
+	test for process_madv
 - cow
 	test copy-on-write semantics
 - thp
@@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
 # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
 CATEGORY="madv_populate" run_test ./madv_populate
 
+# PROCESS_MADV test
+CATEGORY="process_madv" run_test ./process_madv
+
 CATEGORY="vma_merge" run_test ./merge
 
 if [ -x ./memfd_secret ]
diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
index c5b01e1bd4c7..a7e790af38ff 100755
--- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh
+++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
@@ -35,24 +35,7 @@ failfunc=ktap_test_fail
 
 if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then
 	optargs+=('--tolerance_usecs=14000')
-
-	# xfail tests that are known flaky with dbg config, not fixable.
-	# still run them for coverage (and expect 100% pass without dbg).
- declare -ar xfail_list=( - "tcp_blocking_blocking-connect.pkt" - "tcp_blocking_blocking-read.pkt" - "tcp_eor_no-coalesce-retrans.pkt" - "tcp_fast_recovery_prr-ss.*.pkt" - "tcp_sack_sack-route-refresh-ip-tos.pkt" - "tcp_slow_start_slow-start-after-win-update.pkt" - "tcp_timestamping.*.pkt" - "tcp_user_timeout_user-timeout-probe.pkt" - "tcp_zerocopy_cl.*.pkt" - "tcp_zerocopy_epoll_.*.pkt" - "tcp_tcp_info_tcp-info-.*-limited.pkt" - ) - readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$" - [[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail + failfunc=ktap_test_xfail fi ktap_print_header diff --git a/tools/testing/selftests/net/test_neigh.sh b/tools/testing/selftests/net/test_neigh.sh index 388056472b5b..7c594bf6ead0 100755 --- a/tools/testing/selftests/net/test_neigh.sh +++ b/tools/testing/selftests/net/test_neigh.sh @@ -289,11 +289,11 @@ extern_valid_common() orig_base_reachable=$(ip -j ntable show name "$tbl_name" | jq '.[] | select(has("thresh1")) | .["base_reachable"]') run_cmd "ip ntable change name $tbl_name thresh1 10 base_reachable 10000" orig_gc_stale=$(ip -n "$ns1" -j ntable show name "$tbl_name" dev veth0 | jq '.[]["gc_stale"]') - run_cmd "ip -n $ns1 ntable change name $tbl_name dev veth0 gc_stale 5000" - # Wait orig_base_reachable/2 for the new interval to take effect. - run_cmd "sleep $(((orig_base_reachable / 1000) / 2 + 2))" + run_cmd "ip -n $ns1 ntable change name $tbl_name dev veth0 gc_stale 1000" run_cmd "ip -n $ns1 neigh add $ip_addr lladdr $mac nud stale dev veth0 extern_valid" run_cmd "ip -n $ns1 neigh add ${subnet}3 lladdr $mac nud stale dev veth0" + # Wait orig_base_reachable/2 for the new interval to take effect. 
+ run_cmd "sleep $(((orig_base_reachable / 1000) / 2 + 2))" for i in {1..20}; do run_cmd "ip -n $ns1 neigh add ${subnet}$((i + 4)) nud none dev veth0" done diff --git a/tools/testing/selftests/net/vlan_hw_filter.sh b/tools/testing/selftests/net/vlan_hw_filter.sh index 0fb56baf28e4..e195d5cab6f7 100755 --- a/tools/testing/selftests/net/vlan_hw_filter.sh +++ b/tools/testing/selftests/net/vlan_hw_filter.sh @@ -55,10 +55,10 @@ test_vlan0_del_crash_01() { ip netns exec ${NETNS} ip link add bond0 type bond mode 0 ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on - ip netns exec ${NETNS} ifconfig bond0 down - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 down + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function" cleanup } @@ -68,11 +68,11 @@ test_vlan0_del_crash_02() { setup ip netns exec ${NETNS} ip link add bond0 type bond mode 0 ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q - ip netns exec ${NETNS} ifconfig bond0 down - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 down + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function" cleanup } @@ -84,9 +84,9 @@ test_vlan0_del_crash_03() { ip netns exec ${NETNS} ip link add bond0 type bond mode 0 ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q ip netns 
exec ${NETNS} ethtool -K bond0 rx-vlan-filter off - ip netns exec ${NETNS} ifconfig bond0 up + ip netns exec ${NETNS} ip link set dev bond0 up ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on - ip netns exec ${NETNS} ifconfig bond0 down + ip netns exec ${NETNS} ip link set dev bond0 down ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function" cleanup } diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore index ee93dc4969b8..4931b3b6bbd3 100644 --- a/tools/testing/selftests/perf_events/.gitignore +++ b/tools/testing/selftests/perf_events/.gitignore @@ -2,3 +2,4 @@ sigtrap_threads remove_on_exec watermark_signal +mmap diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile index 70e3ff211278..2e5d85770dfe 100644 --- a/tools/testing/selftests/perf_events/Makefile +++ b/tools/testing/selftests/perf_events/Makefile @@ -2,5 +2,5 @@ CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) LDFLAGS += -lpthread -TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal +TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal mmap include ../lib.mk diff --git a/tools/testing/selftests/perf_events/mmap.c b/tools/testing/selftests/perf_events/mmap.c new file mode 100644 index 000000000000..ea0427aac1f9 --- /dev/null +++ b/tools/testing/selftests/perf_events/mmap.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE + +#include <dirent.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <unistd.h> + +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/types.h> + +#include <linux/perf_event.h> + +#include "../kselftest_harness.h" + +#define RB_SIZE 0x3000 +#define AUX_SIZE 0x10000 +#define AUX_OFFS 0x4000 + +#define HOLE_SIZE 0x1000 + +/* Reserve space for rb, aux with space for shrink-beyond-vma testing. 
*/
+#define REGION_SIZE		(2 * RB_SIZE + 2 * AUX_SIZE)
+#define REGION_AUX_OFFS		(2 * RB_SIZE)
+
+#define MAP_BASE		1
+#define MAP_AUX			2
+
+#define EVENT_SRC_DIR		"/sys/bus/event_source/devices"
+
+// Per-test state:
+//   fd     - the perf event opened by the setup, -1 while none is open
+//   ptr    - the mapping under test (ring buffer or AUX buffer)
+//   region - base of the PROT_NONE reservation the mappings are placed in
+FIXTURE(perf_mmap)
+{
+	int		fd;
+	void		*ptr;
+	void		*region;
+};
+
+// aux selects whether the AUX buffer or the ring buffer is tested;
+// ptr_size is the size of that mapping.
+FIXTURE_VARIANT(perf_mmap)
+{
+	bool		aux;
+	unsigned long	ptr_size;
+};
+
+FIXTURE_VARIANT_ADD(perf_mmap, rb)
+{
+	.aux = false,
+	.ptr_size = RB_SIZE,
+};
+
+FIXTURE_VARIANT_ADD(perf_mmap, aux)
+{
+	.aux = true,
+	.ptr_size = AUX_SIZE,
+};
+
+// Read the numeric event type from EVENT_SRC_DIR/<dent->d_name>/type.
+// Returns true and stores the value in *type when it could be parsed,
+// false when the file cannot be opened or holds no number.
+static bool read_event_type(struct dirent *dent, __u32 *type)
+{
+	char typefn[512];
+	FILE *fp;
+	int res;
+
+	snprintf(typefn, sizeof(typefn), "%s/%s/type", EVENT_SRC_DIR, dent->d_name);
+	fp = fopen(typefn, "r");
+	if (!fp)
+		return false;
+
+	res = fscanf(fp, "%u", type);
+	fclose(fp);
+	return res > 0;
+}
+
+// Probe the entries of EVENT_SRC_DIR for an event that can be opened and
+// mmap()ed (and, for the aux variant, that also accepts an AUX mapping),
+// then open that event for the test and map it into the reserved region.
+FIXTURE_SETUP(perf_mmap)
+{
+	struct perf_event_attr attr = {
+		.size		= sizeof(attr),
+		.disabled	= 1,
+		.exclude_kernel	= 1,
+		.exclude_hv	= 1,
+	};
+	struct perf_event_attr attr_ok = {};
+	unsigned int eacces = 0, map = 0;
+	struct perf_event_mmap_page *rb;
+	struct dirent *dent;
+	void *aux, *region;
+	DIR *dir;
+
+	self->ptr = NULL;
+	// Sentinel checked by the teardown before it closes the descriptor
+	self->fd = -1;
+
+	dir = opendir(EVENT_SRC_DIR);
+	if (!dir)
+		SKIP(return, "perf not available.");
+
+	region = mmap(NULL, REGION_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0);
+	ASSERT_NE(region, MAP_FAILED);
+	self->region = region;
+
+	// Try to find a suitable event on this system
+	while ((dent = readdir(dir))) {
+		int fd;
+
+		if (!read_event_type(dent, &attr.type))
+			continue;
+
+		fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+		if (fd < 0) {
+			if (errno == EACCES)
+				eacces++;
+			continue;
+		}
+
+		// Check whether the event supports mmap()
+		rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
+		if (rb == MAP_FAILED) {
+			close(fd);
+			continue;
+		}
+
+		if (!map) {
+			// Save the event in case that no AUX capable event is found
+			attr_ok = attr;
+			map = MAP_BASE;
+		}
+
+		if (!variant->aux) {
+			// The first mappable event is all the rb variant needs.
+			// Stop probing and release the probe descriptor instead of
+			// leaking one per event; the probe mapping is replaced by
+			// the MAP_FIXED mmap() of self->fd below.
+			close(fd);
+			break;
+		}
+
+		rb->aux_offset = AUX_OFFS;
+		rb->aux_size = AUX_SIZE;
+
+		// Check whether it supports a AUX buffer
+		aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_FIXED, fd, AUX_OFFS);
+		if (aux == MAP_FAILED) {
+			munmap(rb, RB_SIZE);
+			close(fd);
+			continue;
+		}
+
+		attr_ok = attr;
+		map = MAP_AUX;
+		munmap(aux, AUX_SIZE);
+		munmap(rb, RB_SIZE);
+		close(fd);
+		break;
+	}
+	closedir(dir);
+
+	if (!map) {
+		if (!eacces)
+			SKIP(return, "No mappable perf event found.");
+		else
+			SKIP(return, "No permissions for perf_event_open()");
+	}
+
+	// Reopen the selected event; this descriptor lives until teardown
+	self->fd = syscall(SYS_perf_event_open, &attr_ok, 0, -1, -1, 0);
+	ASSERT_NE(self->fd, -1);
+
+	rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, self->fd, 0);
+	ASSERT_NE(rb, MAP_FAILED);
+
+	if (!variant->aux) {
+		self->ptr = rb;
+		return;
+	}
+
+	if (map != MAP_AUX)
+		SKIP(return, "No AUX event found.");
+
+	rb->aux_offset = AUX_OFFS;
+	rb->aux_size = AUX_SIZE;
+	aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
+		   MAP_SHARED | MAP_FIXED, self->fd, AUX_OFFS);
+	ASSERT_NE(aux, MAP_FAILED);
+	self->ptr = aux;
+}
+
+// Drop the whole reservation (including the perf mappings placed inside it)
+// and close the event descriptor if one was opened.
+FIXTURE_TEARDOWN(perf_mmap)
+{
+	ASSERT_EQ(munmap(self->region, REGION_SIZE), 0);
+	if (self->fd != -1)
+		ASSERT_EQ(close(self->fd), 0);
+}
+
+TEST_F(perf_mmap, remap)
+{
+	void *tmp, *ptr = self->ptr;
+	unsigned long size = variant->ptr_size;
+
+	// Test the invalid remaps
+	ASSERT_EQ(mremap(ptr, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+	ASSERT_EQ(mremap(ptr + HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+	ASSERT_EQ(mremap(ptr + size - HOLE_SIZE, HOLE_SIZE, size, MREMAP_MAYMOVE), MAP_FAILED);
+	// Shrink the end of the mapping such that we only unmap past end of the VMA,
+	// which should succeed and poke a hole into the PROT_NONE region
+	ASSERT_NE(mremap(ptr + size - HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
+
+	// Remap the whole buffer to a new address
+	tmp = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(tmp, MAP_FAILED);
+
+	// Try splitting offset 1 hole size into VMA, this should fail
+	ASSERT_EQ(mremap(ptr + HOLE_SIZE, size - HOLE_SIZE, size - HOLE_SIZE,
+			 MREMAP_MAYMOVE | MREMAP_FIXED, tmp), MAP_FAILED);
+	// Remapping the whole thing should succeed fine
+	ptr = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tmp);
+	ASSERT_EQ(ptr, tmp);
+	ASSERT_EQ(munmap(tmp, size), 0);
+}
+
+TEST_F(perf_mmap, unmap)
+{
+	unsigned long size = variant->ptr_size;
+
+	// Try to poke holes into the mappings
+	ASSERT_NE(munmap(self->ptr, HOLE_SIZE), 0);
+	ASSERT_NE(munmap(self->ptr + HOLE_SIZE, HOLE_SIZE), 0);
+	ASSERT_NE(munmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE), 0);
+}
+
+TEST_F(perf_mmap, map)
+{
+	unsigned long size = variant->ptr_size;
+
+	// Try to poke holes into the mappings by mapping anonymous memory over it
+	ASSERT_EQ(mmap(self->ptr, HOLE_SIZE, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+	ASSERT_EQ(mmap(self->ptr + HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+	ASSERT_EQ(mmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore
index b7dde152e75a..f6be8efd57ea 100644
--- a/tools/testing/selftests/ptrace/.gitignore
+++ b/tools/testing/selftests/ptrace/.gitignore
@@ -3,3 +3,4 @@ get_syscall_info
 get_set_sud
 peeksiginfo
 vmaccess
+set_syscall_info
diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
index a40097232967..ba58589a1145 100644
--- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
+++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
@@ -32,12
+32,12 @@ void workload_hint_exit(int signum) fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); if (fd < 0) { - perror("Unable to open workload type feature enable file\n"); + perror("Unable to open workload type feature enable file"); exit(1); } if (write(fd, "0\n", 2) < 0) { - perror("Can't disable workload hints\n"); + perror("Can't disable workload hints"); exit(1); } @@ -68,16 +68,14 @@ int main(int argc, char **argv) exit(1); sprintf(delay_str, "%s\n", argv[1]); - - sprintf(delay_str, "%s\n", argv[1]); fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); if (fd < 0) { - perror("Unable to open workload notification delay\n"); + perror("Unable to open workload notification delay"); exit(1); } if (write(fd, delay_str, strlen(delay_str)) < 0) { - perror("Can't set delay\n"); + perror("Can't set delay"); exit(1); } @@ -94,12 +92,12 @@ int main(int argc, char **argv) /* Enable feature via sysfs knob */ fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); if (fd < 0) { - perror("Unable to open workload type feature enable file\n"); + perror("Unable to open workload type feature enable file"); exit(1); } if (write(fd, "1\n", 2) < 0) { - perror("Can't enable workload hints\n"); + perror("Can't enable workload hints"); exit(1); } @@ -110,7 +108,7 @@ int main(int argc, char **argv) while (1) { fd = open(WORKLOAD_TYPE_INDEX_ATTRIBUTE, O_RDONLY); if (fd < 0) { - perror("Unable to open workload type file\n"); + perror("Unable to open workload type file"); exit(1); } diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index a838c37f93e5..3639aa8dd2b0 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr; #define CAP_IPC_LOCK 14 #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif #define FIRST_USER_ADDRESS 0UL |