summaryrefslogtreecommitdiff
path: root/tools/mm
diff options
context:
space:
mode:
Diffstat (limited to 'tools/mm')
-rw-r--r--tools/mm/Makefile15
-rw-r--r--tools/mm/page-types.c26
-rw-r--r--tools/mm/page_owner_sort.c230
-rw-r--r--tools/mm/show_page_info.py169
-rw-r--r--tools/mm/slabinfo.c21
-rw-r--r--tools/mm/thp_swap_allocator_test.c234
-rw-r--r--tools/mm/thpmaps675
7 files changed, 1223 insertions, 147 deletions
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 6c1da51f4177..f5725b5c23aa 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -3,17 +3,18 @@
#
include ../scripts/Makefile.include
-TARGETS=page-types slabinfo page_owner_sort
+BUILD_TARGETS=page-types slabinfo page_owner_sort thp_swap_allocator_test
+INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
LIB_DIR = ../lib/api
LIBS = $(LIB_DIR)/libapi.a
-CFLAGS += -Wall -Wextra -I../lib/
-LDFLAGS += $(LIBS)
+CFLAGS += -Wall -Wextra -I../lib/ -pthread
+LDFLAGS += $(LIBS) -pthread
-all: $(TARGETS)
+all: $(BUILD_TARGETS)
-$(TARGETS): $(LIBS)
+$(BUILD_TARGETS): $(LIBS)
$(LIBS):
make -C $(LIB_DIR)
@@ -22,11 +23,11 @@ $(LIBS):
$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
clean:
- $(RM) page-types slabinfo page_owner_sort
+ $(RM) page-types slabinfo page_owner_sort thp_swap_allocator_test
make -C $(LIB_DIR) clean
sbindir ?= /usr/sbin
install: all
install -d $(DESTDIR)$(sbindir)
- install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
+ install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
index 8d5595b6c59f..d7e5e8902af8 100644
--- a/tools/mm/page-types.c
+++ b/tools/mm/page-types.c
@@ -22,9 +22,10 @@
#include <time.h>
#include <setjmp.h>
#include <signal.h>
+#include <inttypes.h>
#include <sys/types.h>
-#include <sys/errno.h>
-#include <sys/fcntl.h>
+#include <errno.h>
+#include <fcntl.h>
#include <sys/mount.h>
#include <sys/statfs.h>
#include <sys/mman.h>
@@ -71,12 +72,12 @@
/* [32-] kernel hacking assistances */
#define KPF_RESERVED 32
#define KPF_MLOCKED 33
-#define KPF_MAPPEDTODISK 34
+#define KPF_OWNER_2 34
#define KPF_PRIVATE 35
#define KPF_PRIVATE_2 36
#define KPF_OWNER_PRIVATE 37
#define KPF_ARCH 38
-#define KPF_UNCACHED 39
+#define KPF_UNCACHED 39 /* unused */
#define KPF_SOFTDIRTY 40
#define KPF_ARCH_2 41
@@ -129,12 +130,11 @@ static const char * const page_flag_names[] = {
[KPF_RESERVED] = "r:reserved",
[KPF_MLOCKED] = "m:mlocked",
- [KPF_MAPPEDTODISK] = "d:mappedtodisk",
+ [KPF_OWNER_2] = "d:owner_2",
[KPF_PRIVATE] = "P:private",
[KPF_PRIVATE_2] = "p:private_2",
[KPF_OWNER_PRIVATE] = "O:owner_private",
[KPF_ARCH] = "h:arch",
- [KPF_UNCACHED] = "c:uncached",
[KPF_SOFTDIRTY] = "f:softdirty",
[KPF_ARCH_2] = "H:arch_2",
@@ -392,9 +392,9 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
if (opt_file)
printf("%lx\t", voff);
if (opt_list_cgroup)
- printf("@%llu\t", (unsigned long long)cgroup0);
+ printf("@%" PRIu64 "\t", cgroup0);
if (opt_list_mapcnt)
- printf("%lu\t", mapcnt0);
+ printf("%" PRIu64 "\t", mapcnt0);
printf("%lx\t%lx\t%s\n",
index, count, page_flag_name(flags0));
}
@@ -420,9 +420,9 @@ static void show_page(unsigned long voffset, unsigned long offset,
if (opt_file)
printf("%lx\t", voffset);
if (opt_list_cgroup)
- printf("@%llu\t", (unsigned long long)cgroup);
+ printf("@%" PRIu64 "\t", cgroup);
if (opt_list_mapcnt)
- printf("%lu\t", mapcnt);
+ printf("%" PRIu64 "\t", mapcnt);
printf("%lx\t%s\n", offset, page_flag_name(flags));
}
@@ -472,9 +472,9 @@ static int bit_mask_ok(uint64_t flags)
static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
{
- /* Anonymous pages overload PG_mappedtodisk */
- if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
- flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
+ /* Anonymous pages use PG_owner_2 for anon_exclusive */
+ if ((flags & BIT(ANON)) && (flags & BIT(OWNER_2)))
+ flags ^= BIT(OWNER_2) | BIT(ANON_EXCLUSIVE);
/* SLUB overloads several page flags */
if (flags & BIT(SLAB)) {
diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c
index 99798894b879..14c67e9e84c4 100644
--- a/tools/mm/page_owner_sort.c
+++ b/tools/mm/page_owner_sort.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
+#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -23,9 +24,6 @@
#include <linux/types.h>
#include <getopt.h>
-#define bool int
-#define true 1
-#define false 0
#define TASK_COMM_LEN 16
struct block_list {
@@ -33,7 +31,6 @@ struct block_list {
char *comm; // task command name
char *stacktrace;
__u64 ts_nsec;
- __u64 free_ts_nsec;
int len;
int num;
int page_num;
@@ -42,18 +39,16 @@ struct block_list {
int allocator;
};
enum FILTER_BIT {
- FILTER_UNRELEASE = 1<<1,
- FILTER_PID = 1<<2,
- FILTER_TGID = 1<<3,
- FILTER_COMM = 1<<4
+ FILTER_PID = 1<<1,
+ FILTER_TGID = 1<<2,
+ FILTER_COMM = 1<<3
};
enum CULL_BIT {
- CULL_UNRELEASE = 1<<1,
- CULL_PID = 1<<2,
- CULL_TGID = 1<<3,
- CULL_COMM = 1<<4,
- CULL_STACKTRACE = 1<<5,
- CULL_ALLOCATOR = 1<<6
+ CULL_PID = 1<<1,
+ CULL_TGID = 1<<2,
+ CULL_COMM = 1<<3,
+ CULL_STACKTRACE = 1<<4,
+ CULL_ALLOCATOR = 1<<5
};
enum ALLOCATOR_BIT {
ALLOCATOR_CMA = 1<<1,
@@ -62,14 +57,23 @@ enum ALLOCATOR_BIT {
ALLOCATOR_OTHERS = 1<<4
};
enum ARG_TYPE {
- ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS,
- ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE,
- ARG_ALLOCATOR
+ ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_CULL_TIME,
+ ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_ALLOCATOR
};
enum SORT_ORDER {
SORT_ASC = 1,
SORT_DESC = -1,
};
+enum COMP_FLAG {
+ COMP_NO_FLAG = 0,
+ COMP_ALLOC = 1<<0,
+ COMP_PAGE_NUM = 1<<1,
+ COMP_PID = 1<<2,
+ COMP_STACK = 1<<3,
+ COMP_NUM = 1<<4,
+ COMP_TGID = 1<<5,
+ COMP_COMM = 1<<6
+};
struct filter_condition {
pid_t *pids;
pid_t *tgids;
@@ -90,7 +94,6 @@ static regex_t pid_pattern;
static regex_t tgid_pattern;
static regex_t comm_pattern;
static regex_t ts_nsec_pattern;
-static regex_t free_ts_nsec_pattern;
static struct block_list *list;
static int list_size;
static int max_size;
@@ -181,24 +184,6 @@ static int compare_ts(const void *p1, const void *p2)
return l1->ts_nsec < l2->ts_nsec ? -1 : 1;
}
-static int compare_free_ts(const void *p1, const void *p2)
-{
- const struct block_list *l1 = p1, *l2 = p2;
-
- return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1;
-}
-
-static int compare_release(const void *p1, const void *p2)
-{
- const struct block_list *l1 = p1, *l2 = p2;
-
- if (!l1->free_ts_nsec && !l2->free_ts_nsec)
- return 0;
- if (l1->free_ts_nsec && l2->free_ts_nsec)
- return 0;
- return l1->free_ts_nsec ? 1 : -1;
-}
-
static int compare_cull_condition(const void *p1, const void *p2)
{
if (cull == 0)
@@ -211,8 +196,6 @@ static int compare_cull_condition(const void *p1, const void *p2)
return compare_tgid(p1, p2);
if ((cull & CULL_COMM) && compare_comm(p1, p2))
return compare_comm(p1, p2);
- if ((cull & CULL_UNRELEASE) && compare_release(p1, p2))
- return compare_release(p1, p2);
if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2))
return compare_allocator(p1, p2);
return 0;
@@ -228,6 +211,21 @@ static int compare_sort_condition(const void *p1, const void *p2)
return cmp;
}
+static int remove_pattern(regex_t *pattern, char *buf, int len)
+{
+ regmatch_t pmatch[2];
+ int err;
+
+ err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
+ if (err != 0 || pmatch[1].rm_so == -1)
+ return len;
+
+ memcpy(buf + pmatch[1].rm_so,
+ buf + pmatch[1].rm_eo, len - pmatch[1].rm_eo);
+
+ return len - (pmatch[1].rm_eo - pmatch[1].rm_so);
+}
+
static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
{
int err, val_len;
@@ -366,24 +364,6 @@ static __u64 get_ts_nsec(char *buf)
return ts_nsec;
}
-static __u64 get_free_ts_nsec(char *buf)
-{
- __u64 free_ts_nsec;
- char free_ts_nsec_str[FIELD_BUFF] = {0};
- char *endptr;
-
- search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf);
- errno = 0;
- free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10);
- if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') {
- if (debug_on)
- fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf);
- return -1;
- }
-
- return free_ts_nsec;
-}
-
static char *get_comm(char *buf)
{
char *comm_str = malloc(TASK_COMM_LEN);
@@ -395,6 +375,7 @@ static char *get_comm(char *buf)
if (errno != 0) {
if (debug_on)
fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf);
+ free(comm_str);
return NULL;
}
@@ -411,12 +392,8 @@ static int get_arg_type(const char *arg)
return ARG_COMM;
else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st"))
return ARG_STACKTRACE;
- else if (!strcmp(arg, "free") || !strcmp(arg, "f"))
- return ARG_FREE;
else if (!strcmp(arg, "txt") || !strcmp(arg, "T"))
return ARG_TXT;
- else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft"))
- return ARG_FREE_TS;
else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at"))
return ARG_ALLOC_TS;
else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator"))
@@ -471,13 +448,6 @@ static bool match_str_list(const char *str, char **list, int list_size)
static bool is_need(char *buf)
{
- __u64 ts_nsec, free_ts_nsec;
-
- ts_nsec = get_ts_nsec(buf);
- free_ts_nsec = get_free_ts_nsec(buf);
-
- if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec)
- return false;
if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
return false;
if ((filter & FILTER_TGID) &&
@@ -497,13 +467,6 @@ static bool is_need(char *buf)
static bool add_list(char *buf, int len, char *ext_buf)
{
- if (list_size != 0 &&
- len == list[list_size-1].len &&
- memcmp(buf, list[list_size-1].txt, len) == 0) {
- list[list_size-1].num++;
- list[list_size-1].page_num += get_page_num(buf);
- return true;
- }
if (list_size == max_size) {
fprintf(stderr, "max_size too small??\n");
return false;
@@ -519,6 +482,9 @@ static bool add_list(char *buf, int len, char *ext_buf)
return false;
}
memcpy(list[list_size].txt, buf, len);
+ if (sc.cmps[0] != compare_ts) {
+ len = remove_pattern(&ts_nsec_pattern, list[list_size].txt, len);
+ }
list[list_size].txt[len] = 0;
list[list_size].len = len;
list[list_size].num = 1;
@@ -528,7 +494,6 @@ static bool add_list(char *buf, int len, char *ext_buf)
if (*list[list_size].stacktrace == '\n')
list[list_size].stacktrace++;
list[list_size].ts_nsec = get_ts_nsec(buf);
- list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
list[list_size].allocator = get_allocator(buf, ext_buf);
list_size++;
if (list_size % 1000 == 0) {
@@ -554,8 +519,6 @@ static bool parse_cull_args(const char *arg_str)
cull |= CULL_COMM;
else if (arg_type == ARG_STACKTRACE)
cull |= CULL_STACKTRACE;
- else if (arg_type == ARG_FREE)
- cull |= CULL_UNRELEASE;
else if (arg_type == ARG_ALLOCATOR)
cull |= CULL_ALLOCATOR;
else {
@@ -616,8 +579,6 @@ static bool parse_sort_args(const char *arg_str)
sc.cmps[i] = compare_stacktrace;
else if (arg_type == ARG_ALLOC_TS)
sc.cmps[i] = compare_ts;
- else if (arg_type == ARG_FREE_TS)
- sc.cmps[i] = compare_free_ts;
else if (arg_type == ARG_TXT)
sc.cmps[i] = compare_txt;
else if (arg_type == ARG_ALLOCATOR)
@@ -672,21 +633,26 @@ static void print_allocator(FILE *out, int allocator)
static void usage(void)
{
printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
- "-m\t\tSort by total memory.\n"
- "-s\t\tSort by the stack trace.\n"
- "-t\t\tSort by times (default).\n"
- "-p\t\tSort by pid.\n"
- "-P\t\tSort by tgid.\n"
- "-n\t\tSort by task command name.\n"
- "-a\t\tSort by memory allocate time.\n"
- "-r\t\tSort by memory release time.\n"
- "-f\t\tFilter out the information of blocks whose memory has been released.\n"
- "-d\t\tPrint debug information.\n"
- "--pid <pidlist>\tSelect by pid. This selects the information of blocks whose process ID numbers appear in <pidlist>.\n"
- "--tgid <tgidlist>\tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in <tgidlist>.\n"
- "--name <cmdlist>\n\t\tSelect by command name. This selects the information of blocks whose command name appears in <cmdlist>.\n"
- "--cull <rules>\tCull by user-defined rules.<rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
- "--sort <order>\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
+ "-a\t\t\tSort by memory allocation time.\n"
+ "-m\t\t\tSort by total memory.\n"
+ "-n\t\t\tSort by task command name.\n"
+ "-p\t\t\tSort by pid.\n"
+ "-P\t\t\tSort by tgid.\n"
+ "-s\t\t\tSort by the stacktrace.\n"
+ "-t\t\t\tSort by number of times record is seen (default).\n\n"
+ "--pid <pidlist>\t\tSelect by pid. This selects the information"
+ " of\n\t\t\tblocks whose process ID numbers appear in <pidlist>.\n"
+ "--tgid <tgidlist>\tSelect by tgid. This selects the information"
+ " of\n\t\t\tblocks whose Thread Group ID numbers appear in "
+ "<tgidlist>.\n"
+ "--name <cmdlist>\tSelect by command name. This selects the"
+ " information\n\t\t\tof blocks whose command name appears in"
+ " <cmdlist>.\n"
+ "--cull <rules>\t\tCull by user-defined rules. <rules> is a "
+ "single\n\t\t\targument in the form of a comma-separated list "
+ "with some\n\t\t\tcommon fields predefined (pid, tgid, comm, "
+ "stacktrace, allocator)\n"
+ "--sort <order>\t\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
);
}
@@ -694,50 +660,50 @@ int main(int argc, char **argv)
{
FILE *fin, *fout;
char *buf, *ext_buf;
- int i, count;
+ int i, count, compare_flag;
struct stat st;
int opt;
struct option longopts[] = {
{ "pid", required_argument, NULL, 1 },
{ "tgid", required_argument, NULL, 2 },
{ "name", required_argument, NULL, 3 },
- { "cull", required_argument, NULL, 4 },
- { "sort", required_argument, NULL, 5 },
+ { "cull", required_argument, NULL, 4 },
+ { "sort", required_argument, NULL, 5 },
+ { "help", no_argument, NULL, 'h' },
{ 0, 0, 0, 0},
};
- while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1)
+ compare_flag = COMP_NO_FLAG;
+
+ while ((opt = getopt_long(argc, argv, "admnpstPh", longopts, NULL)) != -1)
switch (opt) {
case 'a':
- set_single_cmp(compare_ts, SORT_ASC);
+ compare_flag |= COMP_ALLOC;
break;
case 'd':
debug_on = true;
break;
- case 'f':
- filter = filter | FILTER_UNRELEASE;
- break;
case 'm':
- set_single_cmp(compare_page_num, SORT_DESC);
+ compare_flag |= COMP_PAGE_NUM;
break;
case 'p':
- set_single_cmp(compare_pid, SORT_ASC);
- break;
- case 'r':
- set_single_cmp(compare_free_ts, SORT_ASC);
+ compare_flag |= COMP_PID;
break;
case 's':
- set_single_cmp(compare_stacktrace, SORT_ASC);
+ compare_flag |= COMP_STACK;
break;
case 't':
- set_single_cmp(compare_num, SORT_DESC);
+ compare_flag |= COMP_NUM;
break;
case 'P':
- set_single_cmp(compare_tgid, SORT_ASC);
+ compare_flag |= COMP_TGID;
break;
case 'n':
- set_single_cmp(compare_comm, SORT_ASC);
+ compare_flag |= COMP_COMM;
break;
+ case 'h':
+ usage();
+ exit(0);
case 1:
filter = filter | FILTER_PID;
fc.pids = parse_nums_list(optarg, &fc.pids_size);
@@ -784,6 +750,39 @@ int main(int argc, char **argv)
exit(1);
}
+ /* Only one compare option is allowed, yet we also want handle the
+ * default case were no option is provided, but we still want to
+ * match the behavior of the -t option (compare by number of times
+ * a record is seen
+ */
+ switch (compare_flag) {
+ case COMP_ALLOC:
+ set_single_cmp(compare_ts, SORT_ASC);
+ break;
+ case COMP_PAGE_NUM:
+ set_single_cmp(compare_page_num, SORT_DESC);
+ break;
+ case COMP_PID:
+ set_single_cmp(compare_pid, SORT_ASC);
+ break;
+ case COMP_STACK:
+ set_single_cmp(compare_stacktrace, SORT_ASC);
+ break;
+ case COMP_NO_FLAG:
+ case COMP_NUM:
+ set_single_cmp(compare_num, SORT_DESC);
+ break;
+ case COMP_TGID:
+ set_single_cmp(compare_tgid, SORT_ASC);
+ break;
+ case COMP_COMM:
+ set_single_cmp(compare_comm, SORT_ASC);
+ break;
+ default:
+ usage();
+ exit(1);
+ }
+
fin = fopen(argv[optind], "r");
fout = fopen(argv[optind + 1], "w");
if (!fin || !fout) {
@@ -800,10 +799,8 @@ int main(int argc, char **argv)
goto out_tgid;
if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
goto out_comm;
- if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
+ if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns"))
goto out_ts;
- if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
- goto out_free_ts;
fstat(fileno(fin), &st);
max_size = st.st_size / 100; /* hack ... */
@@ -864,9 +861,6 @@ int main(int argc, char **argv)
fprintf(fout, ", ");
print_allocator(fout, list[i].allocator);
}
- if (cull & CULL_UNRELEASE)
- fprintf(fout, " (%s)",
- list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED");
if (cull & CULL_STACKTRACE)
fprintf(fout, ":\n%s", list[i].stacktrace);
fprintf(fout, "\n");
@@ -880,8 +874,6 @@ out_free:
free(buf);
if (list)
free(list);
-out_free_ts:
- regfree(&free_ts_nsec_pattern);
out_ts:
regfree(&ts_nsec_pattern);
out_comm:
diff --git a/tools/mm/show_page_info.py b/tools/mm/show_page_info.py
new file mode 100644
index 000000000000..c46d8ea283d7
--- /dev/null
+++ b/tools/mm/show_page_info.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env drgn
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2025 Ye Liu <liuye@kylinos.cn>
+
+import argparse
+import sys
+from drgn import Object, FaultError, PlatformFlags, cast
+from drgn.helpers.linux import find_task, follow_page, page_size
+from drgn.helpers.linux.mm import (
+ decode_page_flags, page_to_pfn, page_to_phys, page_to_virt, vma_find,
+ PageSlab, PageCompound, PageHead, PageTail, compound_head, compound_order, compound_nr
+)
+from drgn.helpers.linux.cgroup import cgroup_name, cgroup_path
+
+DESC = """
+This is a drgn script to show the page state.
+For more info on drgn, visit https://github.com/osandov/drgn.
+"""
+
+def format_page_data(page):
+ """
+ Format raw page data into a readable hex dump with "RAW:" prefix.
+
+ :param page: drgn.Object instance representing the page.
+ :return: Formatted string of memory contents.
+ """
+ try:
+ address = page.value_()
+ size = prog.type("struct page").size
+
+ if prog.platform.flags & PlatformFlags.IS_64_BIT:
+ word_size = 8
+ else:
+ word_size = 4
+ num_words = size // word_size
+
+ values = []
+ for i in range(num_words):
+ word_address = address + i * word_size
+ word = prog.read_word(word_address)
+ values.append(f"{word:0{word_size * 2}x}")
+
+ lines = [f"RAW: {' '.join(values[i:i + 4])}" for i in range(0, len(values), 4)]
+
+ return "\n".join(lines)
+
+ except FaultError as e:
+ return f"Error reading memory: {e}"
+ except Exception as e:
+ return f"Unexpected error: {e}"
+
+def get_memcg_info(page):
+ """Retrieve memory cgroup information for a page."""
+ try:
+ MEMCG_DATA_OBJEXTS = prog.constant("MEMCG_DATA_OBJEXTS").value_()
+ MEMCG_DATA_KMEM = prog.constant("MEMCG_DATA_KMEM").value_()
+ mask = prog.constant('__NR_MEMCG_DATA_FLAGS').value_() - 1
+ memcg_data = page.memcg_data.read_()
+ if memcg_data & MEMCG_DATA_OBJEXTS:
+ slabobj_ext = cast("struct slabobj_ext *", memcg_data & ~mask)
+ memcg = slabobj_ext.objcg.memcg.value_()
+ elif memcg_data & MEMCG_DATA_KMEM:
+ objcg = cast("struct obj_cgroup *", memcg_data & ~mask)
+ memcg = objcg.memcg.value_()
+ else:
+ memcg = cast("struct mem_cgroup *", memcg_data & ~mask)
+
+ if memcg.value_() == 0:
+ return "none", "/sys/fs/cgroup/memory/"
+ cgrp = memcg.css.cgroup
+ return cgroup_name(cgrp).decode(), f"/sys/fs/cgroup/memory{cgroup_path(cgrp).decode()}"
+ except FaultError as e:
+ return "unknown", f"Error retrieving memcg info: {e}"
+ except Exception as e:
+ return "unknown", f"Unexpected error: {e}"
+
+def show_page_state(page, addr, mm, pid, task):
+ """Display detailed information about a page."""
+ try:
+ print(f'PID: {pid} Comm: {task.comm.string_().decode()} mm: {hex(mm)}')
+ try:
+ print(format_page_data(page))
+ except FaultError as e:
+ print(f"Error reading page data: {e}")
+ fields = {
+ "Page Address": hex(page.value_()),
+ "Page Flags": decode_page_flags(page),
+ "Page Size": prog["PAGE_SIZE"].value_(),
+ "Page PFN": hex(page_to_pfn(page).value_()),
+ "Page Physical": hex(page_to_phys(page).value_()),
+ "Page Virtual": hex(page_to_virt(page).value_()),
+ "Page Refcount": page._refcount.counter.value_(),
+ "Page Mapcount": page._mapcount.counter.value_(),
+ "Page Index": hex(page.__folio_index.value_()),
+ "Page Memcg Data": hex(page.memcg_data.value_()),
+ }
+
+ memcg_name, memcg_path = get_memcg_info(page)
+ fields["Memcg Name"] = memcg_name
+ fields["Memcg Path"] = memcg_path
+ fields["Page Mapping"] = hex(page.mapping.value_())
+ fields["Page Anon/File"] = "Anon" if page.mapping.value_() & 0x1 else "File"
+
+ try:
+ vma = vma_find(mm, addr)
+ fields["Page VMA"] = hex(vma.value_())
+ fields["VMA Start"] = hex(vma.vm_start.value_())
+ fields["VMA End"] = hex(vma.vm_end.value_())
+ except FaultError as e:
+ fields["Page VMA"] = "Unavailable"
+ fields["VMA Start"] = "Unavailable"
+ fields["VMA End"] = "Unavailable"
+ print(f"Error retrieving VMA information: {e}")
+
+ # Calculate the maximum field name length for alignment
+ max_field_len = max(len(field) for field in fields)
+
+ # Print aligned fields
+ for field, value in fields.items():
+ print(f"{field}:".ljust(max_field_len + 2) + f"{value}")
+
+ # Additional information about the page
+ if PageSlab(page):
+ print("This page belongs to the slab allocator.")
+
+ if PageCompound(page):
+ print("This page is part of a compound page.")
+ if PageHead(page):
+ print("This page is the head page of a compound page.")
+ if PageTail(page):
+ print("This page is the tail page of a compound page.")
+ print(f"{'Head Page:'.ljust(max_field_len + 2)}{hex(compound_head(page).value_())}")
+ print(f"{'Compound Order:'.ljust(max_field_len + 2)}{compound_order(page).value_()}")
+ print(f"{'Number of Pages:'.ljust(max_field_len + 2)}{compound_nr(page).value_()}")
+ else:
+ print("This page is not part of a compound page.")
+ except FaultError as e:
+ print(f"Error accessing page state: {e}")
+ except Exception as e:
+ print(f"Unexpected error: {e}")
+
+def main():
+ """Main function to parse arguments and display page state."""
+ parser = argparse.ArgumentParser(description=DESC, formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument('pid', metavar='PID', type=int, help='Target process ID (PID)')
+ parser.add_argument('vaddr', metavar='VADDR', type=str, help='Target virtual address in hexadecimal format (e.g., 0x7fff1234abcd)')
+ args = parser.parse_args()
+
+ try:
+ vaddr = int(args.vaddr, 16)
+ except ValueError:
+ sys.exit(f"Error: Invalid virtual address format: {args.vaddr}")
+
+ try:
+ task = find_task(args.pid)
+ mm = task.mm
+ page = follow_page(mm, vaddr)
+
+ if page:
+ show_page_state(page, vaddr, mm, args.pid, task)
+ else:
+ sys.exit(f"Address {hex(vaddr)} is not mapped.")
+ except FaultError as e:
+ sys.exit(f"Error accessing task or memory: {e}")
+ except Exception as e:
+ sys.exit(f"Unexpected error: {e}")
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c
index cfaeaea71042..80cdbd3db82d 100644
--- a/tools/mm/slabinfo.c
+++ b/tools/mm/slabinfo.c
@@ -21,7 +21,7 @@
#include <regex.h>
#include <errno.h>
-#define MAX_SLABS 500
+#define MAX_SLABS 2000
#define MAX_ALIASES 500
#define MAX_NODES 1024
@@ -155,6 +155,7 @@ static void usage(void)
static unsigned long read_obj(const char *name)
{
+ size_t len;
FILE *f = fopen(name, "r");
if (!f) {
@@ -165,8 +166,10 @@ static unsigned long read_obj(const char *name)
if (!fgets(buffer, sizeof(buffer), f))
buffer[0] = 0;
fclose(f);
- if (buffer[strlen(buffer)] == '\n')
- buffer[strlen(buffer)] = 0;
+ len = strlen(buffer);
+
+ if (len > 0 && buffer[len - 1] == '\n')
+ buffer[len - 1] = 0;
}
return strlen(buffer);
}
@@ -1228,6 +1231,8 @@ static void read_slab_dir(void)
continue;
switch (de->d_type) {
case DT_LNK:
+ if (alias - aliasinfo == MAX_ALIASES)
+ fatal("Too many aliases\n");
alias->name = strdup(de->d_name);
count = readlink(de->d_name, buffer, sizeof(buffer)-1);
@@ -1242,6 +1247,8 @@ static void read_slab_dir(void)
alias++;
break;
case DT_DIR:
+ if (slab - slabinfo == MAX_SLABS)
+ fatal("Too many slabs\n");
if (chdir(de->d_name))
fatal("Unable to access slab %s\n", slab->name);
slab->name = strdup(de->d_name);
@@ -1297,7 +1304,9 @@ static void read_slab_dir(void)
slab->cpu_partial_free = get_obj("cpu_partial_free");
slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
slab->deactivate_bypass = get_obj("deactivate_bypass");
- chdir("..");
+ if (chdir(".."))
+ fatal("Unable to chdir from slab ../%s\n",
+ slab->name);
if (slab->name[0] == ':')
alias_targets++;
slab++;
@@ -1310,10 +1319,6 @@ static void read_slab_dir(void)
slabs = slab - slabinfo;
actual_slabs = slabs;
aliases = alias - aliasinfo;
- if (slabs > MAX_SLABS)
- fatal("Too many slabs\n");
- if (aliases > MAX_ALIASES)
- fatal("Too many aliases\n");
}
static void output_slabs(void)
diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c
new file mode 100644
index 000000000000..83afc52275a5
--- /dev/null
+++ b/tools/mm/thp_swap_allocator_test.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * thp_swap_allocator_test
+ *
+ * The purpose of this test program is helping check if THP swpout
+ * can correctly get swap slots to swap out as a whole instead of
+ * being split. It randomly releases swap entries through madvise
+ * DONTNEED and swapin/out on two memory areas: a memory area for
+ * 64KB THP and the other area for small folios. The second memory
+ * can be enabled by "-s".
+ * Before running the program, we need to setup a zRAM or similar
+ * swap device by:
+ * echo lzo > /sys/block/zram0/comp_algorithm
+ * echo 64M > /sys/block/zram0/disksize
+ * echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
+ * echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
+ * mkswap /dev/zram0
+ * swapon /dev/zram0
+ * The expected result should be 0% anon swpout fallback ratio w/ or
+ * w/o "-s".
+ *
+ * Author(s): Barry Song <v-songbaohua@oppo.com>
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <time.h>
+
+#define MEMSIZE_MTHP (60 * 1024 * 1024)
+#define MEMSIZE_SMALLFOLIO (4 * 1024 * 1024)
+#define ALIGNMENT_MTHP (64 * 1024)
+#define ALIGNMENT_SMALLFOLIO (4 * 1024)
+#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
+#define TOTAL_DONTNEED_SMALLFOLIO (1 * 1024 * 1024)
+#define MTHP_FOLIO_SIZE (64 * 1024)
+
+#define SWPOUT_PATH \
+ "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
+#define SWPOUT_FALLBACK_PATH \
+ "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
+
+static void *aligned_alloc_mem(size_t size, size_t alignment)
+{
+ void *mem = NULL;
+
+ if (posix_memalign(&mem, alignment, size) != 0) {
+ perror("posix_memalign");
+ return NULL;
+ }
+ return mem;
+}
+
+/*
+ * This emulates the behavior of native libc and Java heap,
+ * as well as process exit and munmap. It helps generate mTHP
+ * and ensures that iterations can proceed with mTHP, as we
+ * currently don't support large folios swap-in.
+ */
+static void random_madvise_dontneed(void *mem, size_t mem_size,
+ size_t align_size, size_t total_dontneed_size)
+{
+ size_t num_pages = total_dontneed_size / align_size;
+ size_t i;
+ size_t offset;
+ void *addr;
+
+ for (i = 0; i < num_pages; ++i) {
+ offset = (rand() % (mem_size / align_size)) * align_size;
+ addr = (char *)mem + offset;
+ if (madvise(addr, align_size, MADV_DONTNEED) != 0)
+ perror("madvise dontneed");
+
+ memset(addr, 0x11, align_size);
+ }
+}
+
+static void random_swapin(void *mem, size_t mem_size,
+ size_t align_size, size_t total_swapin_size)
+{
+ size_t num_pages = total_swapin_size / align_size;
+ size_t i;
+ size_t offset;
+ void *addr;
+
+ for (i = 0; i < num_pages; ++i) {
+ offset = (rand() % (mem_size / align_size)) * align_size;
+ addr = (char *)mem + offset;
+ memset(addr, 0x11, align_size);
+ }
+}
+
+static unsigned long read_stat(const char *path)
+{
+ FILE *file;
+ unsigned long value;
+
+ file = fopen(path, "r");
+ if (!file) {
+ perror("fopen");
+ return 0;
+ }
+
+ if (fscanf(file, "%lu", &value) != 1) {
+ perror("fscanf");
+ fclose(file);
+ return 0;
+ }
+
+ fclose(file);
+ return value;
+}
+
+int main(int argc, char *argv[])
+{
+ int use_small_folio = 0, aligned_swapin = 0;
+ void *mem1 = NULL, *mem2 = NULL;
+ int i;
+
+ for (i = 1; i < argc; ++i) {
+ if (strcmp(argv[i], "-s") == 0)
+ use_small_folio = 1;
+ else if (strcmp(argv[i], "-a") == 0)
+ aligned_swapin = 1;
+ }
+
+ mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
+ if (mem1 == NULL) {
+ fprintf(stderr, "Failed to allocate large folios memory\n");
+ return EXIT_FAILURE;
+ }
+
+ if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
+ perror("madvise hugepage for mem1");
+ free(mem1);
+ return EXIT_FAILURE;
+ }
+
+ if (use_small_folio) {
+ mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
+ if (mem2 == NULL) {
+ fprintf(stderr, "Failed to allocate small folios memory\n");
+ free(mem1);
+ return EXIT_FAILURE;
+ }
+
+ if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
+ perror("madvise nohugepage for mem2");
+ free(mem1);
+ free(mem2);
+ return EXIT_FAILURE;
+ }
+ }
+
+ /* warm-up phase to occupy the swapfile */
+ memset(mem1, 0x11, MEMSIZE_MTHP);
+ madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT);
+ if (use_small_folio) {
+ memset(mem2, 0x11, MEMSIZE_SMALLFOLIO);
+ madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT);
+ }
+
+ /* iterations with newly created mTHP, swap-in, and swap-out */
+ for (i = 0; i < 100; ++i) {
+ unsigned long initial_swpout;
+ unsigned long initial_swpout_fallback;
+ unsigned long final_swpout;
+ unsigned long final_swpout_fallback;
+ unsigned long swpout_inc;
+ unsigned long swpout_fallback_inc;
+ double fallback_percentage;
+
+ initial_swpout = read_stat(SWPOUT_PATH);
+ initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+ /*
+ * The following setup creates a 1:1 ratio of mTHP to small folios
+ * since large folio swap-in isn't supported yet. Once we support
+ * mTHP swap-in, we'll likely need to reduce MEMSIZE_MTHP and
+ * increase MEMSIZE_SMALLFOLIO to maintain the ratio.
+ */
+ random_swapin(mem1, MEMSIZE_MTHP,
+ aligned_swapin ? ALIGNMENT_MTHP : ALIGNMENT_SMALLFOLIO,
+ TOTAL_DONTNEED_MTHP);
+ random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
+ TOTAL_DONTNEED_MTHP);
+
+ if (use_small_folio) {
+ random_swapin(mem2, MEMSIZE_SMALLFOLIO,
+ ALIGNMENT_SMALLFOLIO,
+ TOTAL_DONTNEED_SMALLFOLIO);
+ }
+
+ if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
+ perror("madvise pageout for mem1");
+ free(mem1);
+ if (mem2 != NULL)
+ free(mem2);
+ return EXIT_FAILURE;
+ }
+
+ if (use_small_folio) {
+ if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
+ perror("madvise pageout for mem2");
+ free(mem1);
+ free(mem2);
+ return EXIT_FAILURE;
+ }
+ }
+
+ final_swpout = read_stat(SWPOUT_PATH);
+ final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+ swpout_inc = final_swpout - initial_swpout;
+ swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
+
+ fallback_percentage = (double)swpout_fallback_inc /
+ (swpout_fallback_inc + swpout_inc) * 100;
+
+ printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
+ i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);
+ }
+
+ free(mem1);
+ if (mem2 != NULL)
+ free(mem2);
+
+ return EXIT_SUCCESS;
+}
diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps
new file mode 100644
index 000000000000..803e0318f2fe
--- /dev/null
+++ b/tools/mm/thpmaps
@@ -0,0 +1,675 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 ARM Ltd.
+#
+# Utility providing smaps-like output detailing transparent hugepage usage.
+# For more info, run:
+# ./thpmaps --help
+#
+# Requires numpy:
+# pip3 install numpy
+
+
+import argparse
+import collections
+import math
+import os
+import re
+import resource
+import shutil
+import sys
+import textwrap
+import time
+import numpy as np
+
+
+with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
+ PAGE_SIZE = resource.getpagesize()
+ PAGE_SHIFT = int(math.log2(PAGE_SIZE))
+ PMD_SIZE = int(f.read())
+ PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
+
+
+def align_forward(v, a):
+ return (v + (a - 1)) & ~(a - 1)
+
+
+def align_offset(v, a):
+ return v & (a - 1)
+
+
+def kbnr(kb):
+ # Convert KB to number of pages.
+ return (kb << 10) >> PAGE_SHIFT
+
+
+def nrkb(nr):
+ # Convert number of pages to KB.
+ return (nr << PAGE_SHIFT) >> 10
+
+
+def odkb(order):
+ # Convert page order to KB.
+ return (PAGE_SIZE << order) >> 10
+
+
+def cont_ranges_all(search, index):
+ # Given a list of arrays, find the ranges for which values are monotonically
+ # incrementing in all arrays. all arrays in search and index must be the
+ # same size.
+ sz = len(search[0])
+ r = np.full(sz, 2)
+ d = np.diff(search[0]) == 1
+ for dd in [np.diff(arr) == 1 for arr in search[1:]]:
+ d &= dd
+ r[1:] -= d
+ r[:-1] -= d
+ return [np.repeat(arr, r).reshape(-1, 2) for arr in index]
+
+
+class ArgException(Exception):
+ pass
+
+
+class FileIOException(Exception):
+ pass
+
+
+class BinArrayFile:
+ # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
+ # numpy array. Use inherrited class in a with clause to ensure file is
+ # closed when it goes out of scope.
+ def __init__(self, filename, element_size):
+ self.element_size = element_size
+ self.filename = filename
+ self.fd = os.open(self.filename, os.O_RDONLY)
+
+ def cleanup(self):
+ os.close(self.fd)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.cleanup()
+
+ def _readin(self, offset, buffer):
+ length = os.preadv(self.fd, (buffer,), offset)
+ if len(buffer) != length:
+ raise FileIOException('error: {} failed to read {} bytes at {:x}'
+ .format(self.filename, len(buffer), offset))
+
+ def _toarray(self, buf):
+ assert(self.element_size == 8)
+ return np.frombuffer(buf, dtype=np.uint64)
+
+ def getv(self, vec):
+ vec *= self.element_size
+ offsets = vec[:, 0]
+ lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
+ buf = bytearray(int(np.sum(lengths)))
+ view = memoryview(buf)
+ pos = 0
+ for offset, length in zip(offsets, lengths):
+ offset = int(offset)
+ length = int(length)
+ self._readin(offset, view[pos:pos+length])
+ pos += length
+ return self._toarray(buf)
+
+ def get(self, index, nr=1):
+ offset = index * self.element_size
+ length = nr * self.element_size
+ buf = bytearray(length)
+ self._readin(offset, buf)
+ return self._toarray(buf)
+
+
+PM_PAGE_PRESENT = 1 << 63
+PM_PFN_MASK = (1 << 55) - 1
+
+class PageMap(BinArrayFile):
+ # Read ranges of a given pid's pagemap into a numpy array.
+ def __init__(self, pid='self'):
+ super().__init__(f'/proc/{pid}/pagemap', 8)
+
+
+KPF_ANON = 1 << 12
+KPF_COMPOUND_HEAD = 1 << 15
+KPF_COMPOUND_TAIL = 1 << 16
+KPF_THP = 1 << 22
+
+class KPageFlags(BinArrayFile):
+ # Read ranges of /proc/kpageflags into a numpy array.
+ def __init__(self):
+ super().__init__(f'/proc/kpageflags', 8)
+
+
+vma_all_stats = set([
+ "Size",
+ "Rss",
+ "Pss",
+ "Pss_Dirty",
+ "Shared_Clean",
+ "Shared_Dirty",
+ "Private_Clean",
+ "Private_Dirty",
+ "Referenced",
+ "Anonymous",
+ "KSM",
+ "LazyFree",
+ "AnonHugePages",
+ "ShmemPmdMapped",
+ "FilePmdMapped",
+ "Shared_Hugetlb",
+ "Private_Hugetlb",
+ "Swap",
+ "SwapPss",
+ "Locked",
+])
+
+vma_min_stats = set([
+ "Rss",
+ "Anonymous",
+ "AnonHugePages",
+ "ShmemPmdMapped",
+ "FilePmdMapped",
+])
+
+VMA = collections.namedtuple('VMA', [
+ 'name',
+ 'start',
+ 'end',
+ 'read',
+ 'write',
+ 'execute',
+ 'private',
+ 'pgoff',
+ 'major',
+ 'minor',
+ 'inode',
+ 'stats',
+])
+
+class VMAList:
+ # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
+ # instance to receive VMAs.
+ def __init__(self, pid='self', stats=[]):
+ self.vmas = []
+ with open(f'/proc/{pid}/smaps', 'r') as file:
+ for line in file:
+ elements = line.split()
+ if '-' in elements[0]:
+ start, end = map(lambda x: int(x, 16), elements[0].split('-'))
+ major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
+ self.vmas.append(VMA(
+ name=elements[5] if len(elements) == 6 else '',
+ start=start,
+ end=end,
+ read=elements[1][0] == 'r',
+ write=elements[1][1] == 'w',
+ execute=elements[1][2] == 'x',
+ private=elements[1][3] == 'p',
+ pgoff=int(elements[2], 16),
+ major=major,
+ minor=minor,
+ inode=int(elements[4], 16),
+ stats={},
+ ))
+ else:
+ param = elements[0][:-1]
+ if param in stats:
+ value = int(elements[1])
+ self.vmas[-1].stats[param] = {'type': None, 'value': value}
+
+ def __iter__(self):
+ yield from self.vmas
+
+
+def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
+ # Given 4 same-sized arrays representing a range within a page table backed
+ # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+ # True if page is anonymous, heads: True if page is head of a THP), return a
+ # dictionary of statistics describing the mapped THPs.
+ stats = {
+ 'file': {
+ 'partial': 0,
+ 'aligned': [0] * (PMD_ORDER + 1),
+ 'unaligned': [0] * (PMD_ORDER + 1),
+ },
+ 'anon': {
+ 'partial': 0,
+ 'aligned': [0] * (PMD_ORDER + 1),
+ 'unaligned': [0] * (PMD_ORDER + 1),
+ },
+ }
+
+ for rindex, rpfn in zip(ranges[0], ranges[2]):
+ index_next = int(rindex[0])
+ index_end = int(rindex[1]) + 1
+ pfn_end = int(rpfn[1]) + 1
+
+ folios = indexes[index_next:index_end][heads[index_next:index_end]]
+
+ # Account pages for any partially mapped THP at the front. In that case,
+ # the first page of the range is a tail.
+ nr = (int(folios[0]) if len(folios) else index_end) - index_next
+ stats['anon' if anons[index_next] else 'file']['partial'] += nr
+
+ # Account pages for any partially mapped THP at the back. In that case,
+ # the next page after the range is a tail.
+ if len(folios):
+ flags = int(kpageflags.get(pfn_end)[0])
+ if flags & KPF_COMPOUND_TAIL:
+ nr = index_end - int(folios[-1])
+ folios = folios[:-1]
+ index_end -= nr
+ stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr
+
+ # Account fully mapped THPs in the middle of the range.
+ if len(folios):
+ folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
+ folio_orders = np.log2(folio_nrs).astype(np.uint64)
+ for index, order in zip(folios, folio_orders):
+ index = int(index)
+ order = int(order)
+ nr = 1 << order
+ vfn = int(vfns[index])
+ align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
+ anon = 'anon' if anons[index] else 'file'
+ stats[anon][align][order] += nr
+
+ # Account PMD-mapped THPs spearately, so filter out of the stats. There is a
+ # race between acquiring the smaps stats and reading pagemap, where memory
+ # could be deallocated. So clamp to zero incase it would have gone negative.
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+ file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+ vma.stats['FilePmdMapped']['value']
+ stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
+ stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
+
+ rstats = {
+ f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+ f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
+ }
+
+ def flatten_sub(type, subtype, stats):
+ param = f"{type}-thp-pte-{subtype}-{{}}kB"
+ for od, nr in enumerate(stats[2:], 2):
+ rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
+
+ def flatten_type(type, stats):
+ flatten_sub(type, 'aligned', stats['aligned'])
+ flatten_sub(type, 'unaligned', stats['unaligned'])
+ rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
+
+ flatten_type('anon', stats['anon'])
+ flatten_type('file', stats['file'])
+
+ return rstats
+
+
+def cont_parse(vma, order, ranges, anons, heads):
+ # Given 4 same-sized arrays representing a range within a page table backed
+ # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+ # True if page is anonymous, heads: True if page is head of a THP), return a
+ # dictionary of statistics describing the contiguous blocks.
+ nr_cont = 1 << order
+ nr_anon = 0
+ nr_file = 0
+
+ for rindex, rvfn, rpfn in zip(*ranges):
+ index_next = int(rindex[0])
+ index_end = int(rindex[1]) + 1
+ vfn_start = int(rvfn[0])
+ pfn_start = int(rpfn[0])
+
+ if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
+ continue
+
+ off = align_forward(vfn_start, nr_cont) - vfn_start
+ index_next += off
+
+ while index_next + nr_cont <= index_end:
+ folio_boundary = heads[index_next+1:index_next+nr_cont].any()
+ if not folio_boundary:
+ if anons[index_next]:
+ nr_anon += nr_cont
+ else:
+ nr_file += nr_cont
+ index_next += nr_cont
+
+ # Account blocks that are PMD-mapped spearately, so filter out of the stats.
+ # There is a race between acquiring the smaps stats and reading pagemap,
+ # where memory could be deallocated. So clamp to zero incase it would have
+ # gone negative.
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+ file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+ vma.stats['FilePmdMapped']['value']
+ nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
+ nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
+
+ rstats = {
+ f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+ f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
+ }
+
+ rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
+ rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}
+
+ return rstats
+
+
+def vma_print(vma, pid):
+ # Prints a VMA instance in a format similar to smaps. The main difference is
+ # that the pid is included as the first value.
+ print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
+ .format(
+ pid, vma.start, vma.end,
+ 'r' if vma.read else '-', 'w' if vma.write else '-',
+ 'x' if vma.execute else '-', 'p' if vma.private else 's',
+ vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
+ ))
+
+
+def stats_print(stats, tot_anon, tot_file, inc_empty):
+ # Print a statistics dictionary.
+ label_field = 32
+ for label, stat in stats.items():
+ type = stat['type']
+ value = stat['value']
+ if value or inc_empty:
+ pad = max(0, label_field - len(label) - 1)
+ if type == 'anon' and tot_anon > 0:
+ percent = f' ({value / tot_anon:3.0%})'
+ elif type == 'file' and tot_file > 0:
+ percent = f' ({value / tot_file:3.0%})'
+ else:
+ percent = ''
+ print(f"{label}:{' ' * pad}{value:8} kB{percent}")
+
+
+def vma_parse(vma, pagemap, kpageflags, contorders):
+ # Generate thp and cont statistics for a single VMA.
+ start = vma.start >> PAGE_SHIFT
+ end = vma.end >> PAGE_SHIFT
+
+ pmes = pagemap.get(start, end - start)
+ present = pmes & PM_PAGE_PRESENT != 0
+ pfns = pmes & PM_PFN_MASK
+ pfns = pfns[present]
+ vfns = np.arange(start, end, dtype=np.uint64)
+ vfns = vfns[present]
+
+ pfn_vec = cont_ranges_all([pfns], [pfns])[0]
+ flags = kpageflags.getv(pfn_vec)
+ anons = flags & KPF_ANON != 0
+ heads = flags & KPF_COMPOUND_HEAD != 0
+ thps = flags & KPF_THP != 0
+
+ vfns = vfns[thps]
+ pfns = pfns[thps]
+ anons = anons[thps]
+ heads = heads[thps]
+
+ indexes = np.arange(len(vfns), dtype=np.uint64)
+ ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])
+
+ thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
+ contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]
+
+ tot_anon = vma.stats['Anonymous']['value']
+ tot_file = vma.stats['Rss']['value'] - tot_anon
+
+ return {
+ **thpstats,
+ **{k: v for s in contstats for k, v in s.items()}
+ }, tot_anon, tot_file
+
+
+def do_main(args):
+ pids = set()
+ rollup = {}
+ rollup_anon = 0
+ rollup_file = 0
+
+ if args.cgroup:
+ strict = False
+ for walk_info in os.walk(args.cgroup):
+ cgroup = walk_info[0]
+ with open(f'{cgroup}/cgroup.procs') as pidfile:
+ for line in pidfile.readlines():
+ pids.add(int(line.strip()))
+ elif args.pid:
+ strict = True
+ pids = pids.union(args.pid)
+ else:
+ strict = False
+ for pid in os.listdir('/proc'):
+ if pid.isdigit():
+ pids.add(int(pid))
+
+ if not args.rollup:
+ print(" PID START END PROT OFFSET DEV INODE OBJECT")
+
+ for pid in pids:
+ try:
+ with PageMap(pid) as pagemap:
+ with KPageFlags() as kpageflags:
+ for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
+ if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
+ stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
+ else:
+ stats = {}
+ vma_anon = 0
+ vma_file = 0
+ if args.inc_smaps:
+ stats = {**vma.stats, **stats}
+ if args.rollup:
+ for k, v in stats.items():
+ if k in rollup:
+ assert(rollup[k]['type'] == v['type'])
+ rollup[k]['value'] += v['value']
+ else:
+ rollup[k] = v
+ rollup_anon += vma_anon
+ rollup_file += vma_file
+ else:
+ vma_print(vma, pid)
+ stats_print(stats, vma_anon, vma_file, args.inc_empty)
+ except (FileNotFoundError, ProcessLookupError, FileIOException):
+ if strict:
+ raise
+
+ if args.rollup:
+ stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
+
+
+def main():
+ docs_width = shutil.get_terminal_size().columns
+ docs_width -= 2
+ docs_width = min(80, docs_width)
+
+ def format(string):
+ text = re.sub(r'\s+', ' ', string)
+ text = re.sub(r'\s*\\n\s*', '\n', text)
+ paras = text.split('\n')
+ paras = [textwrap.fill(p, width=docs_width) for p in paras]
+ return '\n'.join(paras)
+
+ def formatter(prog):
+ return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)
+
+ def size2order(human):
+ units = {
+ "K": 2**10, "M": 2**20, "G": 2**30,
+ "k": 2**10, "m": 2**20, "g": 2**30,
+ }
+ unit = 1
+ if human[-1] in units:
+ unit = units[human[-1]]
+ human = human[:-1]
+ try:
+ size = int(human)
+ except ValueError:
+ raise ArgException('error: --cont value must be integer size with optional KMG unit')
+ size *= unit
+ order = int(math.log2(size / PAGE_SIZE))
+ if order < 1:
+ raise ArgException('error: --cont value must be size of at least 2 pages')
+ if (1 << order) * PAGE_SIZE != size:
+ raise ArgException('error: --cont value must be size of power-of-2 pages')
+ if order > PMD_ORDER:
+ raise ArgException('error: --cont value must be less than or equal to PMD order')
+ return order
+
+ parser = argparse.ArgumentParser(formatter_class=formatter,
+ description=format("""Prints information about how transparent huge
+ pages are mapped, either system-wide, or for a specified
+ process or cgroup.\\n
+ \\n
+ When run with --pid, the user explicitly specifies the set
+ of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
+ with --cgroup, the user passes either a v1 or v2 cgroup and
+ all pids that belong to the cgroup subtree are scanned. When
+ run with neither --pid nor --cgroup, the full set of pids on
+ the system is gathered from /proc and scanned as if the user
+ had provided "--pid 1 --pid 2 ...".\\n
+ \\n
+ A default set of statistics is always generated for THP
+ mappings. However, it is also possible to generate
+ additional statistics for "contiguous block mappings" where
+ the block size is user-defined.\\n
+ \\n
+ Statistics are maintained independently for anonymous and
+ file-backed (pagecache) memory and are shown both in kB and
+ as a percentage of either total anonymous or total
+ file-backed memory as appropriate.\\n
+ \\n
+ THP Statistics\\n
+ --------------\\n
+ \\n
+ Statistics are always generated for fully- and
+ contiguously-mapped THPs whose mapping address is aligned to
+ their size, for each <size> supported by the system.
+ Separate counters describe THPs mapped by PTE vs those
+ mapped by PMD. (Although note a THP can only be mapped by
+ PMD if it is PMD-sized):\\n
+ \\n
+ - anon-thp-pte-aligned-<size>kB\\n
+ - file-thp-pte-aligned-<size>kB\\n
+ - anon-thp-pmd-aligned-<size>kB\\n
+ - file-thp-pmd-aligned-<size>kB\\n
+ \\n
+ Similarly, statistics are always generated for fully- and
+ contiguously-mapped THPs whose mapping address is *not*
+ aligned to their size, for each <size> supported by the
+ system. Due to the unaligned mapping, it is impossible to
+ map by PMD, so there are only PTE counters for this case:\\n
+ \\n
+ - anon-thp-pte-unaligned-<size>kB\\n
+ - file-thp-pte-unaligned-<size>kB\\n
+ \\n
+ Statistics are also always generated for mapped pages that
+ belong to a THP but where the is THP is *not* fully- and
+ contiguously- mapped. These "partial" mappings are all
+ counted in the same counter regardless of the size of the
+ THP that is partially mapped:\\n
+ \\n
+ - anon-thp-pte-partial\\n
+ - file-thp-pte-partial\\n
+ \\n
+ Contiguous Block Statistics\\n
+ ---------------------------\\n
+ \\n
+ An optional, additional set of statistics is generated for
+ every contiguous block size specified with `--cont <size>`.
+ These statistics show how much memory is mapped in
+ contiguous blocks of <size> and also aligned to <size>. A
+ given contiguous block must all belong to the same THP, but
+ there is no requirement for it to be the *whole* THP.
+ Separate counters describe contiguous blocks mapped by PTE
+ vs those mapped by PMD:\\n
+ \\n
+ - anon-cont-pte-aligned-<size>kB\\n
+ - file-cont-pte-aligned-<size>kB\\n
+ - anon-cont-pmd-aligned-<size>kB\\n
+ - file-cont-pmd-aligned-<size>kB\\n
+ \\n
+ As an example, if monitoring 64K contiguous blocks (--cont
+ 64K), there are a number of sources that could provide such
+ blocks: a fully- and contiguously-mapped 64K THP that is
+ aligned to a 64K boundary would provide 1 block. A fully-
+ and contiguously-mapped 128K THP that is aligned to at least
+ a 64K boundary would provide 2 blocks. Or a 128K THP that
+ maps its first 100K, but contiguously and starting at a 64K
+ boundary would provide 1 block. A fully- and
+ contiguously-mapped 2M THP would provide 32 blocks. There
+ are many other possible permutations.\\n"""),
+ epilog=format("""Requires root privilege to access pagemap and
+ kpageflags."""))
+
+ group = parser.add_mutually_exclusive_group(required=False)
+ group.add_argument('--pid',
+ metavar='pid', required=False, type=int, default=[], action='append',
+ help="""Process id of the target process. Maybe issued multiple times to
+ scan multiple processes. --pid and --cgroup are mutually exclusive.
+ If neither are provided, all processes are scanned to provide
+ system-wide information.""")
+
+ group.add_argument('--cgroup',
+ metavar='path', required=False,
+ help="""Path to the target cgroup in sysfs. Iterates over every pid in
+ the cgroup and its children. --pid and --cgroup are mutually
+ exclusive. If neither are provided, all processes are scanned to
+ provide system-wide information.""")
+
+ parser.add_argument('--rollup',
+ required=False, default=False, action='store_true',
+ help="""Sum the per-vma statistics to provide a summary over the whole
+ system, process or cgroup.""")
+
+ parser.add_argument('--cont',
+ metavar='size[KMG]', required=False, default=[], action='append',
+ help="""Adds stats for memory that is mapped in contiguous blocks of
+ <size> and also aligned to <size>. May be issued multiple times to
+ track multiple sized blocks. Useful to infer e.g. arm64 contpte and
+ hpa mappings. Size must be a power-of-2 number of pages.""")
+
+ parser.add_argument('--inc-smaps',
+ required=False, default=False, action='store_true',
+ help="""Include all numerical, additive /proc/<pid>/smaps stats in the
+ output.""")
+
+ parser.add_argument('--inc-empty',
+ required=False, default=False, action='store_true',
+ help="""Show all statistics including those whose value is 0.""")
+
+ parser.add_argument('--periodic',
+ metavar='sleep_ms', required=False, type=int,
+ help="""Run in a loop, polling every sleep_ms milliseconds.""")
+
+ args = parser.parse_args()
+
+ try:
+ args.cont = [size2order(cont) for cont in args.cont]
+ except ArgException as e:
+ parser.print_usage()
+ raise
+
+ if args.periodic:
+ while True:
+ do_main(args)
+ print()
+ time.sleep(args.periodic / 1000)
+ else:
+ do_main(args)
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ except Exception as e:
+ prog = os.path.basename(sys.argv[0])
+ print(f'{prog}: {e}')
+ exit(1)