summary refs log tree commit diff
path: root/tools/mm
diff options
context:
space:
mode:
Diffstat (limited to 'tools/mm')
-rw-r--r-- tools/mm/.gitignore 4
-rw-r--r-- tools/mm/Makefile 33
-rw-r--r-- tools/mm/page-types.c 1392
-rw-r--r-- tools/mm/page_owner_sort.c 889
-rw-r--r-- tools/mm/show_page_info.py 169
-rw-r--r-- tools/mm/slabinfo-gnuplot.sh 268
-rw-r--r-- tools/mm/slabinfo.c 1549
-rw-r--r-- tools/mm/thp_swap_allocator_test.c 234
-rw-r--r-- tools/mm/thpmaps 675
9 files changed, 5213 insertions, 0 deletions
diff --git a/tools/mm/.gitignore b/tools/mm/.gitignore
new file mode 100644
index 000000000000..922879f93fc8
--- /dev/null
+++ b/tools/mm/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+slabinfo
+page-types
+page_owner_sort
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
new file mode 100644
index 000000000000..f5725b5c23aa
--- /dev/null
+++ b/tools/mm/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm tools
+#
+include ../scripts/Makefile.include
+
+BUILD_TARGETS=page-types slabinfo page_owner_sort thp_swap_allocator_test
+INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
+
+LIB_DIR = ../lib/api
+LIBS = $(LIB_DIR)/libapi.a
+
+CFLAGS += -Wall -Wextra -I../lib/ -pthread
+LDFLAGS += $(LIBS) -pthread
+
+all: $(BUILD_TARGETS)
+
+$(BUILD_TARGETS): $(LIBS)
+
+# Build libapi through a recursive make. $(MAKE) (rather than a literal
+# "make") keeps -j/jobserver and command-line flags flowing into the sub-make.
+$(LIBS):
+	$(MAKE) -C $(LIB_DIR)
+
+%: %.c
+ $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+# Remove the built tools, then clean libapi via $(MAKE) so that flags and the
+# jobserver are inherited by the sub-make.
+clean:
+	$(RM) page-types slabinfo page_owner_sort thp_swap_allocator_test
+	$(MAKE) -C $(LIB_DIR) clean
+
+sbindir ?= /usr/sbin
+
+install: all
+ install -d $(DESTDIR)$(sbindir)
+ install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
new file mode 100644
index 000000000000..d7e5e8902af8
--- /dev/null
+++ b/tools/mm/page-types.c
@@ -0,0 +1,1392 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * page-types: Tool for querying page flags
+ *
+ * Copyright (C) 2009 Intel corporation
+ *
+ * Authors: Wu Fengguang <fengguang.wu@intel.com>
+ */
+
+#define _FILE_OFFSET_BITS 64
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <getopt.h>
+#include <limits.h>
+#include <assert.h>
+#include <ftw.h>
+#include <time.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mount.h>
+#include <sys/statfs.h>
+#include <sys/mman.h>
+#include "../../include/uapi/linux/magic.h"
+#include "../../include/uapi/linux/kernel-page-flags.h"
+#include <api/fs/fs.h>
+
+#ifndef MAX_PATH
+# define MAX_PATH 256
+#endif
+
+#ifndef STR
+# define _STR(x) #x
+# define STR(x) _STR(x)
+#endif
+
+/*
+ * pagemap kernel ABI bits
+ */
+
+#define PM_ENTRY_BYTES 8
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
+#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+#define MAX_SWAPFILES_SHIFT 5
+#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
+#define PM_SOFT_DIRTY (1ULL << 55)
+#define PM_MMAP_EXCLUSIVE (1ULL << 56)
+#define PM_FILE (1ULL << 61)
+#define PM_SWAP (1ULL << 62)
+#define PM_PRESENT (1ULL << 63)
+
+/*
+ * kernel page flags
+ */
+
+#define KPF_BYTES 8
+#define PROC_KPAGEFLAGS "/proc/kpageflags"
+#define PROC_KPAGECOUNT "/proc/kpagecount"
+#define PROC_KPAGECGROUP "/proc/kpagecgroup"
+
+#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
+
+/* [32-] kernel hacking assistances */
+#define KPF_RESERVED 32
+#define KPF_MLOCKED 33
+#define KPF_OWNER_2 34
+#define KPF_PRIVATE 35
+#define KPF_PRIVATE_2 36
+#define KPF_OWNER_PRIVATE 37
+#define KPF_ARCH 38
+#define KPF_UNCACHED 39 /* unused */
+#define KPF_SOFTDIRTY 40
+#define KPF_ARCH_2 41
+
+/* [47-] take some arbitrary free slots for expanding overloaded flags
+ * not part of kernel API
+ */
+#define KPF_ANON_EXCLUSIVE 47
+#define KPF_READAHEAD 48
+#define KPF_SLUB_FROZEN 50
+#define KPF_SLUB_DEBUG 51
+#define KPF_FILE 61
+#define KPF_SWAP 62
+#define KPF_MMAP_EXCLUSIVE 63
+
+#define KPF_ALL_BITS ((uint64_t)~0ULL)
+#define KPF_HACKERS_BITS (0xffffULL << 32)
+#define KPF_OVERLOADED_BITS (0xffffULL << 48)
+#define BIT(name) (1ULL << KPF_##name)
+#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
+
+/*
+ * Names of the page flag bits, indexed by bit number (KPF_*).
+ *
+ * Every entry has the form "X:longname": character 0 is the one-letter
+ * symbol used by page_flag_name(), and the text from offset 2 onwards is the
+ * long name used by page_flag_longname(), parse_flag_name() and usage().
+ * Bits without an entry are NULL and treated as unknown.
+ */
+static const char * const page_flag_names[] = {
+ [KPF_LOCKED] = "L:locked",
+ [KPF_ERROR] = "E:error",
+ [KPF_REFERENCED] = "R:referenced",
+ [KPF_UPTODATE] = "U:uptodate",
+ [KPF_DIRTY] = "D:dirty",
+ [KPF_LRU] = "l:lru",
+ [KPF_ACTIVE] = "A:active",
+ [KPF_SLAB] = "S:slab",
+ [KPF_WRITEBACK] = "W:writeback",
+ [KPF_RECLAIM] = "I:reclaim",
+ [KPF_BUDDY] = "B:buddy",
+
+ [KPF_MMAP] = "M:mmap",
+ [KPF_ANON] = "a:anonymous",
+ [KPF_SWAPCACHE] = "s:swapcache",
+ [KPF_SWAPBACKED] = "b:swapbacked",
+ [KPF_COMPOUND_HEAD] = "H:compound_head",
+ [KPF_COMPOUND_TAIL] = "T:compound_tail",
+ [KPF_HUGE] = "G:huge",
+ [KPF_UNEVICTABLE] = "u:unevictable",
+ [KPF_HWPOISON] = "X:hwpoison",
+ [KPF_NOPAGE] = "n:nopage",
+ [KPF_KSM] = "x:ksm",
+ [KPF_THP] = "t:thp",
+ [KPF_OFFLINE] = "o:offline",
+ [KPF_PGTABLE] = "g:pgtable",
+ [KPF_ZERO_PAGE] = "z:zero_page",
+ [KPF_IDLE] = "i:idle_page",
+
+ /* tool-private "kernel hacking" bits (KPF_HACKERS_BITS range) */
+ [KPF_RESERVED] = "r:reserved",
+ [KPF_MLOCKED] = "m:mlocked",
+ [KPF_OWNER_2] = "d:owner_2",
+ [KPF_PRIVATE] = "P:private",
+ [KPF_PRIVATE_2] = "p:private_2",
+ [KPF_OWNER_PRIVATE] = "O:owner_private",
+ [KPF_ARCH] = "h:arch",
+ [KPF_SOFTDIRTY] = "f:softdirty",
+ [KPF_ARCH_2] = "H:arch_2",
+
+ /* expanded forms of overloaded flags, see expand_overloaded_flags() */
+ [KPF_ANON_EXCLUSIVE] = "d:anon_exclusive",
+ [KPF_READAHEAD] = "I:readahead",
+ [KPF_SLUB_FROZEN] = "A:slub_frozen",
+ [KPF_SLUB_DEBUG] = "E:slub_debug",
+
+ /* bits derived from the pagemap entry rather than from kpageflags */
+ [KPF_FILE] = "F:file",
+ [KPF_SWAP] = "w:swap",
+ [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
+};
+
+
+/*
+ * data structures
+ */
+
+static int opt_raw; /* for kernel developers */
+static int opt_list; /* list pages (in ranges) */
+static int opt_mark_idle; /* set accessed bit */
+static int opt_no_summary; /* don't show summary */
+static pid_t opt_pid; /* process to walk */
+const char *opt_file; /* file or directory path */
+static uint64_t opt_cgroup; /* cgroup inode */
+static int opt_list_cgroup;/* list page cgroup */
+static int opt_list_mapcnt;/* list page map count */
+static const char *opt_kpageflags;/* kpageflags file to parse */
+
+#define MAX_ADDR_RANGES 1024
+static int nr_addr_ranges;
+static unsigned long opt_offset[MAX_ADDR_RANGES];
+static unsigned long opt_size[MAX_ADDR_RANGES];
+
+#define MAX_VMAS 10240
+static int nr_vmas;
+static unsigned long pg_start[MAX_VMAS];
+static unsigned long pg_end[MAX_VMAS];
+
+#define MAX_BIT_FILTERS 64
+static int nr_bit_filters;
+static uint64_t opt_mask[MAX_BIT_FILTERS];
+static uint64_t opt_bits[MAX_BIT_FILTERS];
+
+static int page_size;
+
+static int pagemap_fd;
+static int kpageflags_fd;
+static int kpagecount_fd = -1;
+static int kpagecgroup_fd = -1;
+static int page_idle_fd = -1;
+
+static int opt_hwpoison;
+static int opt_unpoison;
+
+static const char *hwpoison_debug_fs;
+static int hwpoison_inject_fd;
+static int hwpoison_forget_fd;
+
+#define HASH_SHIFT 13
+#define HASH_SIZE (1 << HASH_SHIFT)
+#define HASH_MASK (HASH_SIZE - 1)
+#define HASH_KEY(flags) (flags & HASH_MASK)
+
+static unsigned long total_pages;
+static unsigned long nr_pages[HASH_SIZE];
+static uint64_t page_flags[HASH_SIZE];
+
+
+/*
+ * helper functions
+ */
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define min_t(type, x, y) ({ \
+ type __min1 = (x); \
+ type __min2 = (y); \
+ __min1 < __min2 ? __min1 : __min2; })
+
+#define max_t(type, x, y) ({ \
+ type __max1 = (x); \
+ type __max2 = (y); \
+ __max1 > __max2 ? __max1 : __max2; })
+
+/*
+ * Convert a page count into whole megabytes (rounded down).
+ *
+ * The product is computed in 64 bits so that large page counts do not wrap
+ * on targets where unsigned long is only 32 bits wide.
+ */
+static unsigned long pages2mb(unsigned long pages)
+{
+	return ((uint64_t)pages * page_size) >> 20;
+}
+
+/*
+ * Print a printf-style message to stderr and terminate the process with
+ * EXIT_FAILURE.  This function does not return.
+ */
+static void fatal(const char *x, ...)
+{
+	va_list args;
+
+	va_start(args, x);
+	vfprintf(stderr, x, args);
+	va_end(args);
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * open(2) wrapper that never returns a failure: on error the reason is
+ * reported via perror(pathname) and the process exits with EXIT_FAILURE.
+ */
+static int checked_open(const char *pathname, int flags)
+{
+	int fd = open(pathname, flags);
+
+	if (fd >= 0)
+		return fd;
+
+	perror(pathname);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * pagemap/kpageflags routines
+ */
+
+/*
+ * Read up to @count 64-bit entries, starting at entry @index, from @fd
+ * into @buf.
+ *
+ * @name is only used to label the perror() message on a read error.
+ * Exits the program on an index overflow, a read error or a read that is
+ * not a multiple of 8 bytes.  Returns the number of entries read, which may
+ * be smaller than @count (0 at end of file).
+ */
+static unsigned long do_u64_read(int fd, const char *name,
+				  uint64_t *buf,
+				  unsigned long index,
+				  unsigned long count)
+{
+	ssize_t bytes;
+
+	/* index * 8 below must not wrap */
+	if (index > ULONG_MAX / 8)
+		fatal("index overflow: %lu\n", index);
+
+	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
+	if (bytes < 0) {
+		perror(name);
+		exit(EXIT_FAILURE);
+	}
+	if (bytes % 8)
+		fatal("partial read: %zd bytes\n", bytes);
+
+	return bytes / 8;
+}
+
+/*
+ * Fetch @pages kpageflags entries, starting at page frame @index, into @buf.
+ * Returns the number of entries actually read.
+ */
+static unsigned long kpageflags_read(uint64_t *buf,
+				     unsigned long index,
+				     unsigned long pages)
+{
+	unsigned long nr_read;
+
+	nr_read = do_u64_read(kpageflags_fd, opt_kpageflags,
+			      buf, index, pages);
+
+	return nr_read;
+}
+
+/*
+ * Fetch @pages kpagecgroup entries, starting at page frame @index, into @buf.
+ *
+ * When /proc/kpagecgroup has not been opened, @buf is left untouched and
+ * @pages is returned so that callers see a "complete" read.
+ */
+static unsigned long kpagecgroup_read(uint64_t *buf,
+				      unsigned long index,
+				      unsigned long pages)
+{
+	if (kpagecgroup_fd < 0)
+		return pages;
+
+	/* label read errors with the file that is actually being read */
+	return do_u64_read(kpagecgroup_fd, PROC_KPAGECGROUP, buf, index, pages);
+}
+
+/*
+ * Fetch @pages kpagecount entries, starting at page frame @index, into @buf.
+ *
+ * When /proc/kpagecount has not been opened, @buf is left untouched and
+ * @pages is returned.
+ */
+static unsigned long kpagecount_read(uint64_t *buf,
+				     unsigned long index,
+				     unsigned long pages)
+{
+	if (kpagecount_fd < 0)
+		return pages;
+
+	return do_u64_read(kpagecount_fd, PROC_KPAGECOUNT, buf, index, pages);
+}
+
+/*
+ * Fetch @pages pagemap entries, starting at virtual page @index, from the
+ * currently open pagemap file into @buf.  Returns the number read.
+ */
+static unsigned long pagemap_read(uint64_t *buf,
+				  unsigned long index,
+				  unsigned long pages)
+{
+	unsigned long nr_read;
+
+	nr_read = do_u64_read(pagemap_fd, "/proc/pid/pagemap",
+			      buf, index, pages);
+
+	return nr_read;
+}
+
+/*
+ * Extract the page frame number from pagemap entry @val.
+ * Returns 0 when the entry does not have PM_PRESENT set.
+ */
+static unsigned long pagemap_pfn(uint64_t val)
+{
+	if (!(val & PM_PRESENT))
+		return 0;
+
+	return PM_PFRAME(val);
+}
+
+/*
+ * Extract the swap offset from pagemap entry @val.
+ * Returns 0 when the entry does not have PM_SWAP set.
+ */
+static unsigned long pagemap_swap_offset(uint64_t val)
+{
+	if (val & PM_SWAP)
+		return PM_SWAP_OFFSET(val);
+
+	return 0;
+}
+
+/*
+ * page flag names
+ */
+
+/*
+ * Render @flags as a compact symbolic string: one character per named flag
+ * bit, the flag's one-letter symbol when set and '_' when clear.
+ *
+ * Exits the program if a bit without a name is set.  The result lives in a
+ * static buffer that is overwritten by the next call.
+ */
+static char *page_flag_name(uint64_t flags)
+{
+	/* one character per possible bit plus the terminating NUL */
+	static char buf[65];
+	int present;
+	size_t i, j;
+
+	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+		present = (flags >> i) & 1;
+		if (!page_flag_names[i]) {
+			if (present)
+				fatal("unknown flag bit %zu\n", i);
+			continue;
+		}
+		buf[j++] = present ? page_flag_names[i][0] : '_';
+	}
+	/* terminate explicitly instead of relying on static zero-init */
+	buf[j] = '\0';
+
+	return buf;
+}
+
+/*
+ * Render @flags as a comma-separated list of the long names of all set,
+ * named flag bits.  Unnamed bits are ignored.  The result lives in a static
+ * buffer that is overwritten by the next call.
+ */
+static char *page_flag_longname(uint64_t flags)
+{
+	static char buf[1024];
+	size_t bit;
+	size_t used = 0;
+
+	for (bit = 0; bit < ARRAY_SIZE(page_flag_names); bit++) {
+		const char *entry = page_flag_names[bit];
+
+		if (!entry || !((flags >> bit) & 1))
+			continue;
+
+		/* skip the "X:" prefix of the table entry */
+		used += snprintf(buf + used, sizeof(buf) - used, "%s,",
+				 entry + 2);
+	}
+
+	/* drop the trailing comma, if anything was written */
+	if (used)
+		used--;
+	buf[used] = '\0';
+
+	return buf;
+}
+
+
+/*
+ * page list and summary
+ */
+
+/*
+ * Range-merging printer used for "-l" output.
+ *
+ * Consecutive calls describing pages with identical flags/cgroup/mapcount
+ * that directly follow the pending run (both in @offset and in @voffset)
+ * are accumulated.  Anything else prints the pending run, if any, and starts
+ * a new one.  A call with @size == 0 therefore only flushes.
+ */
+static void show_page_range(unsigned long voffset, unsigned long offset,
+			    unsigned long size, uint64_t flags,
+			    uint64_t cgroup, uint64_t mapcnt)
+{
+	/* the run accumulated so far */
+	static uint64_t run_flags;
+	static uint64_t run_cgroup;
+	static uint64_t run_mapcnt;
+	static unsigned long run_voff;
+	static unsigned long run_start;
+	static unsigned long run_len;
+
+	int same_attrs = flags == run_flags &&
+			 cgroup == run_cgroup &&
+			 mapcnt == run_mapcnt;
+	int adjacent = offset == run_start + run_len &&
+		       voffset == run_voff + run_len;
+
+	if (same_attrs && adjacent && size) {
+		run_len += size;
+		return;
+	}
+
+	if (run_len) {
+		if (opt_pid)
+			printf("%lx\t", run_voff);
+		if (opt_file)
+			printf("%lx\t", run_voff);
+		if (opt_list_cgroup)
+			printf("@%" PRIu64 "\t", run_cgroup);
+		if (opt_list_mapcnt)
+			printf("%" PRIu64 "\t", run_mapcnt);
+		printf("%lx\t%lx\t%s\n",
+		       run_start, run_len, page_flag_name(run_flags));
+	}
+
+	/* start a new run from the current call */
+	run_flags = flags;
+	run_cgroup = cgroup;
+	run_mapcnt = mapcnt;
+	run_start = offset;
+	run_voff = voffset;
+	run_len = size;
+}
+
+/*
+ * Print the run still pending in show_page_range(), if any, by feeding it
+ * an empty (size 0) range.
+ */
+static void flush_page_range(void)
+{
+	const unsigned long none = 0;
+
+	show_page_range(none, none, none, 0, 0, 0);
+}
+
+/*
+ * Print one line for a single page ("-L" output): optional virtual/file
+ * offset, cgroup and map count columns, then the page offset and its
+ * symbolic flags.
+ */
+static void show_page(unsigned long voffset, unsigned long offset,
+		      uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
+{
+	const char *symbolic;
+
+	if (opt_pid)
+		printf("%lx\t", voffset);
+	if (opt_file)
+		printf("%lx\t", voffset);
+	if (opt_list_cgroup)
+		printf("@%" PRIu64 "\t", cgroup);
+	if (opt_list_mapcnt)
+		printf("%" PRIu64 "\t", mapcnt);
+
+	symbolic = page_flag_name(flags);
+	printf("%lx\t%s\n", offset, symbolic);
+}
+
+/*
+ * Print the summary table: one row per distinct flag combination recorded
+ * in the hash table (raw value, page count, size in MB, symbolic and long
+ * names), followed by a total row.
+ */
+static void show_summary(void)
+{
+	size_t slot;
+
+	printf(" flags\tpage-count MB"
+	       " symbolic-flags\t\t\tlong-symbolic-flags\n");
+
+	for (slot = 0; slot < ARRAY_SIZE(nr_pages); slot++) {
+		/* skip hash slots that never counted a page */
+		if (!nr_pages[slot])
+			continue;
+
+		printf("0x%016llx\t%10lu %8lu %s\t%s\n",
+		       (unsigned long long)page_flags[slot],
+		       nr_pages[slot],
+		       pages2mb(nr_pages[slot]),
+		       page_flag_name(page_flags[slot]),
+		       page_flag_longname(page_flags[slot]));
+	}
+
+	printf(" total\t%10lu %8lu\n",
+	       total_pages, pages2mb(total_pages));
+}
+
+
+/*
+ * page flag filters
+ */
+
+/*
+ * Test @flags against every registered bit filter.
+ *
+ * A filter whose bits value is KPF_ALL_BITS means "any bit of the mask is
+ * set"; otherwise the masked flags must equal the filter's bits exactly.
+ * Returns 1 when all filters pass (or none are registered), 0 otherwise.
+ */
+static int bit_mask_ok(uint64_t flags)
+{
+	int f;
+
+	for (f = 0; f < nr_bit_filters; f++) {
+		uint64_t masked = flags & opt_mask[f];
+		int pass;
+
+		if (opt_bits[f] == KPF_ALL_BITS)
+			pass = masked != 0;
+		else
+			pass = masked == opt_bits[f];
+
+		if (!pass)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Raw-mode flag translation: replace overloaded kernel flag bits by their
+ * tool-private meaning and fold selected pagemap-entry (@pme) bits into
+ * the returned flag word.
+ */
+static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
+{
+	const uint64_t anon_owner2 = BIT(ANON) | BIT(OWNER_2);
+	const uint64_t reclaim_wb = BIT(RECLAIM) | BIT(WRITEBACK);
+
+	/* Anonymous pages use PG_owner_2 for anon_exclusive */
+	if ((flags & anon_owner2) == anon_owner2)
+		flags ^= BIT(OWNER_2) | BIT(ANON_EXCLUSIVE);
+
+	/* SLUB overloads several page flags */
+	if (flags & BIT(SLAB)) {
+		if (flags & BIT(ACTIVE))
+			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
+		if (flags & BIT(ERROR))
+			flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
+	}
+
+	/* PG_reclaim is overloaded as PG_readahead in the read path */
+	if ((flags & reclaim_wb) == BIT(RECLAIM))
+		flags ^= BIT(RECLAIM) | BIT(READAHEAD);
+
+	/* bits that come from the pagemap entry */
+	flags |= (pme & PM_SOFT_DIRTY) ? BIT(SOFTDIRTY) : 0;
+	flags |= (pme & PM_FILE) ? BIT(FILE) : 0;
+	flags |= (pme & PM_SWAP) ? BIT(SWAP) : 0;
+	flags |= (pme & PM_MMAP_EXCLUSIVE) ? BIT(MMAP_EXCLUSIVE) : 0;
+
+	return flags;
+}
+
+/*
+ * Default-mode flag filter: drop the kernel-hacker bits and hide the
+ * compound head/tail bits of pages that are not hugeTLB pages.
+ */
+static uint64_t well_known_flags(uint64_t flags)
+{
+	uint64_t visible = flags & ~KPF_HACKERS_BITS;
+	int is_compound = (visible & BITS_COMPOUND) != 0;
+	int is_huge = (visible & BIT(HUGE)) != 0;
+
+	/* hide non-hugeTLB compound pages */
+	if (is_compound && !is_huge)
+		visible &= ~BITS_COMPOUND;
+
+	return visible;
+}
+
+/*
+ * Translate a raw kpageflags word for display/filtering: expand overloaded
+ * bits in raw mode (-r), otherwise reduce it to the well-known flags.
+ */
+static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
+{
+	return opt_raw ? expand_overloaded_flags(flags, pme)
+		       : well_known_flags(flags);
+}
+
+/*
+ * page actions
+ */
+
+/*
+ * Make sure debugfs is mounted and open the hwpoison injection and/or
+ * unpoison control files requested by the options.  A file is only opened
+ * while its descriptor variable still holds 0.  Exits on any failure.
+ */
+static void prepare_hwpoison_fd(void)
+{
+	char path[MAX_PATH + 1];
+
+	hwpoison_debug_fs = debugfs__mount();
+	if (hwpoison_debug_fs == NULL) {
+		perror("mount debugfs");
+		exit(EXIT_FAILURE);
+	}
+
+	if (opt_hwpoison && hwpoison_inject_fd == 0) {
+		snprintf(path, MAX_PATH, "%s/hwpoison/corrupt-pfn",
+			 hwpoison_debug_fs);
+		hwpoison_inject_fd = checked_open(path, O_WRONLY);
+	}
+
+	if (opt_unpoison && hwpoison_forget_fd == 0) {
+		snprintf(path, MAX_PATH, "%s/hwpoison/unpoison-pfn",
+			 hwpoison_debug_fs);
+		hwpoison_forget_fd = checked_open(path, O_WRONLY);
+	}
+}
+
+/*
+ * Write page frame number @offset (as "0x<hex>\n") to the hwpoison
+ * injection file.  Returns 0 on success, or the negative write() result
+ * after reporting the error.
+ */
+static int hwpoison_page(unsigned long offset)
+{
+	char cmd[100];
+	int cmd_len = sprintf(cmd, "0x%lx\n", offset);
+	int ret = write(hwpoison_inject_fd, cmd, cmd_len);
+
+	if (ret >= 0)
+		return 0;
+
+	perror("hwpoison inject");
+	return ret;
+}
+
+/*
+ * Write page frame number @offset (as "0x<hex>\n") to the unpoison file.
+ * Returns 0 on success, or the negative write() result after reporting
+ * the error.
+ */
+static int unpoison_page(unsigned long offset)
+{
+	char cmd[100];
+	int cmd_len = sprintf(cmd, "0x%lx\n", offset);
+	int ret = write(hwpoison_forget_fd, cmd, cmd_len);
+
+	if (ret >= 0)
+		return 0;
+
+	perror("hwpoison forget");
+	return ret;
+}
+
+/*
+ * Mark page frame @offset idle through the page_idle bitmap file.
+ *
+ * Bits are accumulated in a static 64-bit word covering 64 consecutive page
+ * frames.  The pending word is written out only when a page belonging to a
+ * different 64-page group arrives; that page then starts a new word.
+ *
+ * Returns 0 on success or the negative pwrite() result after reporting the
+ * error.
+ */
+static int mark_page_idle(unsigned long offset)
+{
+	static unsigned long off;
+	static uint64_t buf;
+	int len;
+
+	if ((offset / 64 == off / 64) || buf == 0) {
+		/*
+		 * 1ULL, not 1UL: the shift count goes up to 63, which is
+		 * undefined for a 32-bit unsigned long.
+		 */
+		buf |= 1ULL << (offset % 64);
+		off = offset;
+		return 0;
+	}
+
+	/* the bitmap file is addressed in 8-byte words of 64 pages each */
+	len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
+	if (len < 0) {
+		perror("mark page idle");
+		return len;
+	}
+
+	buf = 1ULL << (offset % 64);
+	off = offset;
+
+	return 0;
+}
+
+/*
+ * page frame walker
+ */
+
+/*
+ * Find, or claim, the hash table slot that counts pages with @flags.
+ *
+ * Open addressing with linear probing over slots 1..HASH_SIZE-1.  Exits
+ * the program when the table is full.
+ */
+static size_t hash_slot(uint64_t flags)
+{
+	size_t slot = HASH_KEY(flags);
+	size_t probes;
+
+	/* Explicitly reserve slot 0 for flags 0: the following logic
+	 * cannot distinguish an unoccupied slot from slot (flags==0).
+	 */
+	if (flags == 0)
+		return 0;
+
+	/* search through the remaining (HASH_SIZE-1) slots */
+	for (probes = 1; probes < ARRAY_SIZE(page_flags); probes++, slot++) {
+		/* wrap around, skipping the reserved slot 0 */
+		if (slot == 0 || slot >= ARRAY_SIZE(page_flags))
+			slot = 1;
+
+		if (page_flags[slot] == flags)
+			return slot;
+
+		if (page_flags[slot] == 0) {
+			page_flags[slot] = flags;
+			return slot;
+		}
+	}
+
+	fatal("hash table full: bump up HASH_SHIFT?\n");
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Process one page: translate its flags, apply the bit and cgroup filters,
+ * run the requested page actions (hwpoison, unpoison, mark idle), list it
+ * if requested and account it in the summary.
+ */
+static void add_page(unsigned long voffset, unsigned long offset,
+		     uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
+		     uint64_t pme)
+{
+	uint64_t shown = kpageflags_flags(flags, pme);
+
+	/* filters */
+	if (!bit_mask_ok(shown))
+		return;
+	if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
+		return;
+
+	/* page actions */
+	if (opt_hwpoison)
+		hwpoison_page(offset);
+	if (opt_unpoison)
+		unpoison_page(offset);
+	if (opt_mark_idle)
+		mark_page_idle(offset);
+
+	/* listing */
+	switch (opt_list) {
+	case 1:
+		show_page_range(voffset, offset, 1, shown, cgroup, mapcnt);
+		break;
+	case 2:
+		show_page(voffset, offset, shown, cgroup, mapcnt);
+		break;
+	default:
+		break;
+	}
+
+	/* summary accounting */
+	nr_pages[hash_slot(shown)]++;
+	total_pages++;
+}
+
+#define KPAGEFLAGS_BATCH	(64 << 10)	/* 64k pages */
+/*
+ * Walk @count page frames starting at frame @index, reading their flags,
+ * cgroup and map count in batches and handing each page to add_page().
+ *
+ * @voffset is the virtual/file page offset reported for the first page and
+ * @pme the pagemap entry passed along for every page.  The walk stops
+ * early when kpageflags has no more entries.
+ */
+static void walk_pfn(unsigned long voffset,
+		     unsigned long index,
+		     unsigned long count,
+		     uint64_t pme)
+{
+	uint64_t buf[KPAGEFLAGS_BATCH];
+	uint64_t cgi[KPAGEFLAGS_BATCH];
+	uint64_t cnt[KPAGEFLAGS_BATCH];
+	unsigned long batch;
+	unsigned long pages;
+	unsigned long i;
+
+	/*
+	 * kpagecgroup_read() and kpagecount_read() only fill their buffer
+	 * when the corresponding /proc file was opened (the files might not
+	 * even exist), so pre-fill both with zeros.  Leaving cnt[]
+	 * uninitialised would feed indeterminate map counts into
+	 * show_page_range() and spuriously split otherwise mergeable ranges.
+	 */
+	if (count == 1) {
+		cgi[0] = 0;
+		cnt[0] = 0;
+	} else {
+		memset(cgi, 0, sizeof cgi);
+		memset(cnt, 0, sizeof cnt);
+	}
+
+	while (count) {
+		batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
+		pages = kpageflags_read(buf, index, batch);
+		if (pages == 0)
+			break;
+
+		if (kpagecgroup_read(cgi, index, pages) != pages)
+			fatal("kpagecgroup returned fewer pages than expected");
+
+		if (kpagecount_read(cnt, index, pages) != pages)
+			fatal("kpagecount returned fewer pages than expected");
+
+		for (i = 0; i < pages; i++)
+			add_page(voffset + i, index + i,
+				 buf[i], cgi[i], cnt[i], pme);
+
+		index += pages;
+		count -= pages;
+	}
+}
+
+/*
+ * Account a swapped-out page described by pagemap entry @pme at virtual
+ * page @voffset.  There is no kpageflags word for it, so its flags are
+ * derived from @pme alone.  Skipped entirely when a cgroup filter is active.
+ */
+static void walk_swap(unsigned long voffset, uint64_t pme)
+{
+	uint64_t flags = kpageflags_flags(0, pme);
+
+	if (!bit_mask_ok(flags) || opt_cgroup)
+		return;
+
+	switch (opt_list) {
+	case 1:
+		show_page_range(voffset, pagemap_swap_offset(pme),
+				1, flags, 0, 0);
+		break;
+	case 2:
+		show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
+		break;
+	default:
+		break;
+	}
+
+	nr_pages[hash_slot(flags)]++;
+	total_pages++;
+}
+
+#define PAGEMAP_BATCH	(64 << 10)
+/*
+ * Walk @count virtual pages starting at page @index: read their pagemap
+ * entries in batches and dispatch present pages to walk_pfn() and swapped
+ * pages to walk_swap().  Stops early when pagemap yields no more entries.
+ */
+static void walk_vma(unsigned long index, unsigned long count)
+{
+	uint64_t buf[PAGEMAP_BATCH];
+	unsigned long pages;
+	unsigned long i;
+
+	for (; count; index += pages, count -= pages) {
+		unsigned long batch = min_t(unsigned long, count,
+					    PAGEMAP_BATCH);
+
+		pages = pagemap_read(buf, index, batch);
+		if (pages == 0)
+			break;
+
+		for (i = 0; i < pages; i++) {
+			uint64_t entry = buf[i];
+			unsigned long pfn = pagemap_pfn(entry);
+
+			if (pfn)
+				walk_pfn(index + i, pfn, 1, entry);
+			if (entry & PM_SWAP)
+				walk_swap(index + i, entry);
+		}
+	}
+}
+
+/*
+ * Walk the part of the virtual page range [@index, @index + @count) that
+ * is covered by the VMAs recorded in pg_start[]/pg_end[], calling
+ * walk_vma() once per overlapping piece.
+ */
+static void walk_task(unsigned long index, unsigned long count)
+{
+	const unsigned long end = index + count;
+	int vma = 0;
+
+	while (index < end) {
+		unsigned long first;
+		unsigned long last;
+
+		/* skip VMAs that end at or before the current position */
+		while (pg_end[vma] <= index) {
+			vma++;
+			if (vma >= nr_vmas)
+				return;
+		}
+		/* the next VMA starts beyond the requested range */
+		if (pg_start[vma] >= end)
+			return;
+
+		/* clip the VMA to the requested range */
+		first = max_t(unsigned long, pg_start[vma], index);
+		last = min_t(unsigned long, pg_end[vma], end);
+		index = last;
+
+		assert(first < last);
+		walk_vma(first, last - first);
+	}
+}
+
+/*
+ * Record one page range to walk.  @size is clamped so that @offset + size
+ * cannot exceed ULONG_MAX.  Exits when the range table is full.
+ */
+static void add_addr_range(unsigned long offset, unsigned long size)
+{
+	const unsigned long room = ULONG_MAX - offset;
+
+	if (nr_addr_ranges >= MAX_ADDR_RANGES)
+		fatal("too many addr ranges\n");
+
+	opt_offset[nr_addr_ranges] = offset;
+	opt_size[nr_addr_ranges] = size < room ? size : room;
+	nr_addr_ranges++;
+}
+
+/*
+ * Walk every requested address range (or everything when none was given):
+ * as physical page frames by default, or as virtual pages of the target
+ * process when a pid was given.
+ */
+static void walk_addr_ranges(void)
+{
+	int r;
+
+	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
+
+	if (nr_addr_ranges == 0)
+		add_addr_range(0, ULONG_MAX);
+
+	for (r = 0; r < nr_addr_ranges; r++) {
+		if (opt_pid)
+			walk_task(opt_offset[r], opt_size[r]);
+		else
+			walk_pfn(opt_offset[r], opt_offset[r], opt_size[r], 0);
+	}
+
+	/* extra call with pfn 0 after the walk */
+	if (opt_mark_idle)
+		mark_page_idle(0);
+
+	close(kpageflags_fd);
+}
+
+
+/*
+ * user interface
+ */
+
+/*
+ * Return the marker printed after a flag name in the usage text:
+ * "(r)" for raw-mode (hacker) bits, "(o)" for overloaded bits, blank
+ * otherwise.
+ */
+static const char *page_flag_type(uint64_t flag)
+{
+	if (flag & KPF_HACKERS_BITS)
+		return "(r)";
+
+	return (flag & KPF_OVERLOADED_BITS) ? "(o)" : " ";
+}
+
+/*
+ * Print the command line help: the option summary, the syntax of the
+ * flags/addr-spec/bits-spec arguments, and the list of all known flag
+ * names, each followed by its page_flag_type() marker.
+ */
+static void usage(void)
+{
+ size_t i, j;
+
+ printf(
+"page-types [options]\n"
+" -r|--raw Raw mode, for kernel developers\n"
+" -d|--describe flags Describe flags\n"
+" -a|--addr addr-spec Walk a range of pages\n"
+" -b|--bits bits-spec Walk pages with specified bits\n"
+" -c|--cgroup path|@inode Walk pages within memory cgroup\n"
+" -p|--pid pid Walk process address space\n"
+" -f|--file filename Walk file address space\n"
+" -i|--mark-idle Mark pages idle\n"
+" -l|--list Show page details in ranges\n"
+" -L|--list-each Show page details one by one\n"
+" -C|--list-cgroup Show cgroup inode for pages\n"
+" -M|--list-mapcnt Show page map count\n"
+" -N|--no-summary Don't show summary info\n"
+" -X|--hwpoison hwpoison pages\n"
+" -x|--unpoison unpoison pages\n"
+" -F|--kpageflags filename kpageflags file to parse\n"
+" -h|--help Show this usage message\n"
+"flags:\n"
+" 0x10 bitfield format, e.g.\n"
+" anon bit-name, e.g.\n"
+" 0x10,anon comma-separated list, e.g.\n"
+"addr-spec:\n"
+" N one page at offset N (unit: pages)\n"
+" N+M pages range from N to N+M-1\n"
+" N,M pages range from N to M-1\n"
+" N, pages range from N to end\n"
+" ,M pages range from 0 to M-1\n"
+"bits-spec:\n"
+" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
+" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
+" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n"
+" =bit1,bit2 flags == (bit1|bit2)\n"
+"bit-names:\n"
+ );
+
+ /* list the long name of every known flag */
+ for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+ if (!page_flag_names[i])
+ continue;
+ printf("%16s%s", page_flag_names[i] + 2,
+ page_flag_type(1ULL << i));
+ /* break the line after every fourth name */
+ if (++j > 3) {
+ j = 0;
+ putchar('\n');
+ }
+ }
+ printf("\n "
+ "(r) raw mode bits (o) overloaded bits\n");
+}
+
+/*
+ * Parse the leading number of @str (decimal, octal or hex, as accepted by
+ * base 0) and return it.
+ *
+ * Uses strtoull() so that the full unsigned 64-bit range is accepted;
+ * strtoll() would clamp values above LLONG_MAX, such as a mask with bit 63
+ * set.  Exits the program when the result is 0 but @str does not start
+ * with '0', i.e. when @str is not a number at all.
+ */
+static unsigned long long parse_number(const char *str)
+{
+	unsigned long long n;
+
+	n = strtoull(str, NULL, 0);
+
+	if (n == 0 && str[0] != '0')
+		fatal("invalid name or number: %s\n", str);
+
+	return n;
+}
+
+/*
+ * Handle "-p pid": remember the pid, open its pagemap file and record the
+ * page range of every mapping listed in /proc/<pid>/maps in
+ * pg_start[]/pg_end[].  Unparsable lines are reported and skipped; parsing
+ * stops once MAX_VMAS mappings have been recorded.
+ */
+static void parse_pid(const char *str)
+{
+	char buf[5000];
+	FILE *maps;
+
+	opt_pid = parse_number(str);
+
+	sprintf(buf, "/proc/%d/pagemap", opt_pid);
+	pagemap_fd = checked_open(buf, O_RDONLY);
+
+	sprintf(buf, "/proc/%d/maps", opt_pid);
+	maps = fopen(buf, "r");
+	if (maps == NULL) {
+		perror(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	while (fgets(buf, sizeof(buf), maps) != NULL) {
+		unsigned long vm_start;
+		unsigned long vm_end;
+		unsigned long long pgoff;
+		unsigned long ino;
+		int major, minor;
+		char r, w, x, s;
+		int fields;
+
+		fields = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
+				&vm_start, &vm_end,
+				&r, &w, &x, &s,
+				&pgoff,
+				&major, &minor,
+				&ino);
+		if (fields < 10) {
+			fprintf(stderr, "unexpected line: %s\n", buf);
+			continue;
+		}
+
+		/* only the address range is kept, converted to page units */
+		pg_start[nr_vmas] = vm_start / page_size;
+		pg_end[nr_vmas] = vm_end / page_size;
+		nr_vmas++;
+		if (nr_vmas >= MAX_VMAS) {
+			fprintf(stderr, "too many VMAs\n");
+			break;
+		}
+	}
+
+	fclose(maps);
+}
+
+/*
+ * Print a short description of file @name from its stat data @st: inode
+ * number, size in bytes and pages, and the modification and access times
+ * together with their age in seconds.
+ */
+static void show_file(const char *name, const struct stat *st)
+{
+	unsigned long long size = st->st_size;
+	char atime[64], mtime[64];
+	long now = time(NULL);
+
+	/* print the inode at full width; it does not fit in an unsigned int */
+	printf("%s\tInode: %llu\tSize: %llu (%llu pages)\n",
+	       name, (unsigned long long)st->st_ino,
+	       size, (size + page_size - 1) / page_size);
+
+	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
+	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
+
+	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
+	       mtime, (long)(now - st->st_mtime),
+	       atime, (long)(now - st->st_atime));
+}
+
+static sigjmp_buf sigbus_jmp;
+
+static void * volatile sigbus_addr;
+
+/*
+ * SIGBUS handler: remember the faulting address (NULL when no siginfo is
+ * supplied) and jump back to the sigsetjmp() point saved in sigbus_jmp.
+ */
+static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
+{
+	(void)sig;
+	(void)ucontex;
+
+	if (info)
+		sigbus_addr = info->si_addr;
+	else
+		sigbus_addr = NULL;
+
+	siglongjmp(sigbus_jmp, 1);
+}
+
+/*
+ * SIGBUS disposition installed by walk_page_cache().  SA_SIGINFO selects the
+ * three-argument handler form so that sigbus_handler() receives the
+ * siginfo_t carrying the faulting address.
+ */
+static struct sigaction sigbus_action = {
+ .sa_sigaction = sigbus_handler,
+ .sa_flags = SA_SIGINFO,
+};
+
+/*
+ * Walk the page cache pages of file @name (open as @fd) in the byte range
+ * [@off, @end).
+ *
+ * The range is processed in batches of at most PAGEMAP_BATCH pages.  Each
+ * batch is mapped, the resident pages are found with mincore() and touched
+ * to populate their page table entries, and the resulting pagemap entries
+ * are used to look up flags, cgroup and map count of every present page,
+ * which is then passed to add_page().
+ *
+ * A SIGBUS while touching pages ends the walk at the faulting offset.
+ * Exits the program on mmap/mincore/madvise/pagemap failures.
+ */
+static void walk_file_range(const char *name, int fd,
+			    unsigned long off, unsigned long end)
+{
+	uint8_t vec[PAGEMAP_BATCH];
+	uint64_t buf[PAGEMAP_BATCH], flags;
+	uint64_t cgroup = 0;
+	uint64_t mapcnt = 0;
+	unsigned long nr_pages, pfn, i;
+	ssize_t len;
+	void *ptr;
+	int first = 1;
+
+	for (; off < end; off += len) {
+		nr_pages = (end - off + page_size - 1) / page_size;
+		if (nr_pages > PAGEMAP_BATCH)
+			nr_pages = PAGEMAP_BATCH;
+		len = nr_pages * page_size;
+
+		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
+		if (ptr == MAP_FAILED)
+			fatal("mmap failed: %s", name);
+
+		/* determine cached pages */
+		if (mincore(ptr, len, vec))
+			fatal("mincore failed: %s", name);
+
+		/* turn off readahead */
+		if (madvise(ptr, len, MADV_RANDOM))
+			fatal("madvice failed: %s", name);
+
+		if (sigsetjmp(sigbus_jmp, 1)) {
+			/*
+			 * Truncate the walk at the faulting byte.  The
+			 * parentheses matter: without them "off + addr" is
+			 * parsed as the ternary condition and the file
+			 * offset of this batch is lost.
+			 */
+			end = off + (sigbus_addr ?
+				     (char *)sigbus_addr - (char *)ptr : 0);
+			fprintf(stderr, "got sigbus at offset %lld: %s\n",
+				(long long)end, name);
+			goto got_sigbus;
+		}
+
+		/* populate ptes */
+		for (i = 0; i < nr_pages ; i++) {
+			if (vec[i] & 1)
+				(void)*(volatile int *)((char *)ptr +
+							i * page_size);
+		}
+got_sigbus:
+
+		/* turn off harvesting reference bits */
+		if (madvise(ptr, len, MADV_SEQUENTIAL))
+			fatal("madvice failed: %s", name);
+
+		if (pagemap_read(buf, (unsigned long)ptr / page_size,
+				 nr_pages) != nr_pages)
+			fatal("cannot read pagemap");
+
+		munmap(ptr, len);
+
+		for (i = 0; i < nr_pages; i++) {
+			pfn = pagemap_pfn(buf[i]);
+			if (!pfn)
+				continue;
+			if (!kpageflags_read(&flags, pfn, 1))
+				continue;
+			if (!kpagecgroup_read(&cgroup, pfn, 1))
+				fatal("kpagecgroup_read failed");
+			if (!kpagecount_read(&mapcnt, pfn, 1))
+				fatal("kpagecount_read failed");
+			/* print any run pending from a previous file first */
+			if (first && opt_list) {
+				first = 0;
+				flush_page_range();
+			}
+			add_page(off / page_size + i, pfn,
+				 flags, cgroup, mapcnt, buf[i]);
+		}
+	}
+}
+
+/*
+ * Walk the page cache of one regular file @name whose stat data is @st.
+ *
+ * With address ranges given on the command line (-a) those page ranges are
+ * walked.  Otherwise the whole file is walked, including a final partial
+ * page.  The implicit whole-file range is deliberately not stored with
+ * add_addr_range(): doing so would make the first file's size apply to
+ * every later file of a directory walk.
+ */
+static void walk_file(const char *name, const struct stat *st)
+{
+	int i;
+	int fd;
+
+	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
+
+	if (!nr_addr_ranges) {
+		walk_file_range(name, fd, 0, st->st_size);
+	} else {
+		for (i = 0; i < nr_addr_ranges; i++)
+			walk_file_range(name, fd, opt_offset[i] * page_size,
+					(opt_offset[i] + opt_size[i]) * page_size);
+	}
+
+	close(fd);
+}
+
+/*
+ * nftw() callback: walk regular files and report unreadable directories.
+ * Always returns 0 so that the tree walk continues.
+ */
+int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
+{
+	(void)f;
+
+	if (type == FTW_F) {
+		if (S_ISREG(st->st_mode))
+			walk_file(name, st);
+	} else if (type == FTW_DNR) {
+		fprintf(stderr, "cannot read dir: %s\n", name);
+	}
+
+	return 0;
+}
+
+struct stat st;
+
+/*
+ * Handle "-f path": walk the page cache of a regular file, or of every
+ * regular file below a directory (without following symlinks or crossing
+ * mount points).  The SIGBUS handler is installed for the duration of the
+ * walk.  Exits on stat/nftw failure or for other file types.
+ */
+static void walk_page_cache(void)
+{
+	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
+	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
+	sigaction(SIGBUS, &sigbus_action, NULL);
+
+	if (stat(opt_file, &st) != 0)
+		fatal("stat failed: %s\n", opt_file);
+
+	switch (st.st_mode & S_IFMT) {
+	case S_IFREG:
+		walk_file(opt_file, &st);
+		break;
+	case S_IFDIR:
+		/* do not follow symlinks and mountpoints */
+		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
+			fatal("nftw failed: %s\n", opt_file);
+		break;
+	default:
+		fatal("unhandled file type: %s\n", opt_file);
+	}
+
+	close(kpageflags_fd);
+	close(pagemap_fd);
+	signal(SIGBUS, SIG_DFL);
+}
+
+/*
+ * Handle "-f path": remember the file or directory to walk.  The string is
+ * not copied, so @name must stay valid.
+ */
+static void parse_file(const char *name)
+{
+	const char *path = name;
+
+	opt_file = path;
+}
+
+/*
+ * Handle "-c": set the cgroup filter either from an explicit "@inode"
+ * number or from the inode number of the cgroup directory @path.
+ * Exits when @path cannot be stat()ed or is not a directory.
+ */
+static void parse_cgroup(const char *path)
+{
+	struct stat sb;
+
+	if (path[0] == '@') {
+		opt_cgroup = parse_number(path + 1);
+		return;
+	}
+
+	if (stat(path, &sb) != 0)
+		fatal("stat failed: %s: %m\n", path);
+
+	if (!S_ISDIR(sb.st_mode))
+		fatal("cgroup supposed to be a directory: %s\n", path);
+
+	opt_cgroup = sb.st_ino;
+}
+
+/*
+ * Handle "-a addr-spec" and record the resulting page range.
+ *
+ * Accepted forms: "N" (one page), "N+M" (M pages from N), "N,M" (pages N
+ * to M-1), "N," or "N+" (from N to the end) and ",M" or "+M" (M pages
+ * from 0).  Exits on an "N,M" range with M < N.
+ */
+static void parse_addr_range(const char *optarg)
+{
+	unsigned long offset;
+	unsigned long size;
+	const char *sep;
+
+	/* a ',' separator takes precedence over '+' */
+	sep = strchr(optarg, ',');
+	if (!sep)
+		sep = strchr(optarg, '+');
+
+	if (!sep) {
+		offset = parse_number(optarg);
+		size = 1;
+	} else if (sep == optarg) {
+		offset = 0;
+		size = parse_number(sep + 1);
+	} else {
+		offset = parse_number(optarg);
+		if (sep[1] == '\0') {
+			size = ULONG_MAX;
+		} else {
+			size = parse_number(sep + 1);
+			if (*sep == ',') {
+				/* "N,M" gives an end page, not a length */
+				if (size < offset)
+					fatal("invalid range: %lu,%lu\n",
+					      offset, size);
+				size -= offset;
+			}
+		}
+	}
+
+	add_addr_range(offset, size);
+}
+
+/*
+ * Register one (mask, bits) flag filter for bit_mask_ok().
+ * Exits when the filter table is full.
+ */
+static void add_bits_filter(uint64_t mask, uint64_t bits)
+{
+	int slot = nr_bit_filters;
+
+	if (slot >= MAX_BIT_FILTERS)
+		fatal("too much bit filters\n");
+
+	opt_mask[slot] = mask;
+	opt_bits[slot] = bits;
+	nr_bit_filters = slot + 1;
+}
+
+/*
+ * Translate one flag token, the first @len characters of @str, into a bit
+ * mask.
+ *
+ * The token may be "compound" (or a prefix of it), a prefix of a long flag
+ * name from page_flag_names[] (the first match in bit order wins), or
+ * otherwise a number.  An empty token yields 0.
+ */
+static uint64_t parse_flag_name(const char *str, int len)
+{
+	size_t bit;
+
+	if (*str == '\0' || len == 0)
+		return 0;
+
+	if (len <= 8 && strncmp(str, "compound", len) == 0)
+		return BITS_COMPOUND;
+
+	for (bit = 0; bit < ARRAY_SIZE(page_flag_names); bit++) {
+		const char *entry = page_flag_names[bit];
+
+		/* compare against the long name after the "X:" prefix */
+		if (entry && strncmp(str, entry + 2, len) == 0)
+			return 1ULL << bit;
+	}
+
+	return parse_number(str);
+}
+
+/*
+ * Parse a comma-separated list of flag tokens and return the OR of their
+ * bit masks.  Parsing stops at the first '=' or at the end of the string.
+ *
+ * Tokens starting with '~' are skipped when @all is 0.  When @all is
+ * non-zero the '~' is stripped and the rest of the token is included,
+ * unless nothing follows the '~' up to the end of the string.
+ */
+static uint64_t parse_flag_names(const char *str, int all)
+{
+ const char *p = str;
+ uint64_t flags = 0;
+
+ while (1) {
+ /* p sits on a delimiter: the current token is [str, p) */
+ if (*p == ',' || *p == '=' || *p == '\0') {
+ /* note: *++str advances str past the '~' as a side effect */
+ if ((*str != '~') || (*str == '~' && all && *++str))
+ flags |= parse_flag_name(str, p - str);
+ if (*p != ',')
+ break;
+ str = p + 1;
+ }
+ p++;
+ }
+
+ return flags;
+}
+
+/*
+ * Handle "-b bits-spec" and register the resulting filter:
+ *   "=bits"       flags must equal bits exactly
+ *   "mask=bits"   masked flags must equal bits
+ *   "a,~b"        mask is a|b, required bits are the non-negated names
+ *   "a,b"         any of the named bits must be set
+ */
+static void parse_bits_mask(const char *optarg)
+{
+	const char *eq = strchr(optarg, '=');
+	uint64_t mask;
+	uint64_t bits;
+
+	if (eq == optarg) {
+		mask = KPF_ALL_BITS;
+		bits = parse_flag_names(eq + 1, 0);
+	} else if (eq != NULL) {
+		mask = parse_flag_names(optarg, 0);
+		bits = parse_flag_names(eq + 1, 0);
+	} else if (strchr(optarg, '~') != NULL) {
+		mask = parse_flag_names(optarg, 1);
+		bits = parse_flag_names(optarg, 0);
+	} else {
+		mask = parse_flag_names(optarg, 0);
+		bits = KPF_ALL_BITS;
+	}
+
+	add_bits_filter(mask, bits);
+}
+
+/*
+ * Handle "-F file": remember the kpageflags file to read.  The string is
+ * not copied, so @name must stay valid.
+ */
+static void parse_kpageflags(const char *name)
+{
+	const char *path = name;
+
+	opt_kpageflags = path;
+}
+
+/*
+ * Handle "-d flags": parse the flag list and print its raw value together
+ * with the symbolic and the long representation.
+ */
+static void describe_flags(const char *optarg)
+{
+	uint64_t flags = parse_flag_names(optarg, 0);
+	const char *symbolic = page_flag_name(flags);
+	const char *longname = page_flag_longname(flags);
+
+	printf("0x%016llx\t%s\t%s\n",
+	       (unsigned long long)flags, symbolic, longname);
+}
+
+/*
+ * Long options for getopt_long().  The second field is has_arg (0: no
+ * argument, 1: required argument) and must agree with the short option
+ * string used in main(): "--kpageflags" takes a file name just like "-F".
+ */
+static const struct option opts[] = {
+	{ "raw"        , 0, NULL, 'r' },
+	{ "pid"        , 1, NULL, 'p' },
+	{ "file"       , 1, NULL, 'f' },
+	{ "addr"       , 1, NULL, 'a' },
+	{ "bits"       , 1, NULL, 'b' },
+	{ "cgroup"     , 1, NULL, 'c' },
+	{ "describe"   , 1, NULL, 'd' },
+	{ "mark-idle"  , 0, NULL, 'i' },
+	{ "list"       , 0, NULL, 'l' },
+	{ "list-each"  , 0, NULL, 'L' },
+	{ "list-cgroup", 0, NULL, 'C' },
+	{ "list-mapcnt", 0, NULL, 'M' },
+	{ "no-summary" , 0, NULL, 'N' },
+	{ "hwpoison"   , 0, NULL, 'X' },
+	{ "unpoison"   , 0, NULL, 'x' },
+	{ "kpageflags" , 1, NULL, 'F' },
+	{ "help"       , 0, NULL, 'h' },
+	{ NULL         , 0, NULL, 0 }
+};
+
+/*
+ * Entry point: parse the options, open the auxiliary /proc and /sys files
+ * the options require, print the list header, walk either the page cache
+ * of a file (-f) or the requested address ranges, and finally print the
+ * summary unless -N was given.
+ */
+int main(int argc, char *argv[])
+{
+	int c;
+
+	page_size = getpagesize();
+
+	while ((c = getopt_long(argc, argv,
+				"rp:f:a:b:d:c:CilLMNXxF:h",
+				opts, NULL)) != -1) {
+		switch (c) {
+		case 'r':
+			opt_raw = 1;
+			break;
+		case 'p':
+			parse_pid(optarg);
+			break;
+		case 'f':
+			parse_file(optarg);
+			break;
+		case 'a':
+			parse_addr_range(optarg);
+			break;
+		case 'b':
+			parse_bits_mask(optarg);
+			break;
+		case 'c':
+			parse_cgroup(optarg);
+			break;
+		case 'C':
+			opt_list_cgroup = 1;
+			break;
+		case 'd':
+			describe_flags(optarg);
+			exit(0);
+		case 'i':
+			opt_mark_idle = 1;
+			break;
+		case 'l':
+			opt_list = 1;
+			break;
+		case 'L':
+			opt_list = 2;
+			break;
+		case 'M':
+			opt_list_mapcnt = 1;
+			break;
+		case 'N':
+			opt_no_summary = 1;
+			break;
+		case 'X':
+			opt_hwpoison = 1;
+			prepare_hwpoison_fd();
+			break;
+		case 'x':
+			opt_unpoison = 1;
+			prepare_hwpoison_fd();
+			break;
+		case 'F':
+			parse_kpageflags(optarg);
+			break;
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			usage();
+			exit(1);
+		}
+	}
+
+	if (!opt_kpageflags)
+		opt_kpageflags = PROC_KPAGEFLAGS;
+
+	if (opt_cgroup || opt_list_cgroup)
+		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
+
+	/* map counts are only shown in list output */
+	if (opt_list && opt_list_mapcnt)
+		kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
+
+	if (opt_mark_idle)
+		page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
+
+	/* column headers of the list output */
+	if (opt_list && opt_pid)
+		printf("voffset\t");
+	if (opt_list && opt_file)
+		printf("foffset\t");
+	if (opt_list && opt_list_cgroup)
+		printf("cgroup\t");
+	if (opt_list && opt_list_mapcnt)
+		printf("map-cnt\t");
+
+	if (opt_list == 1)
+		printf("offset\tlen\tflags\n");
+	if (opt_list == 2)
+		printf("offset\tflags\n");
+
+	if (opt_file)
+		walk_page_cache();
+	else
+		walk_addr_ranges();
+
+	/* print the last pending range */
+	if (opt_list == 1)
+		flush_page_range();
+
+	if (opt_no_summary)
+		return 0;
+
+	if (opt_list)
+		printf("\n\n");
+
+	if (opt_file) {
+		show_file(opt_file, &st);
+		printf("\n");
+	}
+
+	show_summary();
+
+	/*
+	 * Close by descriptor state, not by option: with -M but without a
+	 * list option kpagecount was never opened and the fd is still -1.
+	 */
+	if (kpagecount_fd >= 0)
+		close(kpagecount_fd);
+
+	if (kpagecgroup_fd >= 0)
+		close(kpagecgroup_fd);
+
+	if (page_idle_fd >= 0)
+		close(page_idle_fd);
+
+	return 0;
+}
diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c
new file mode 100644
index 000000000000..14c67e9e84c4
--- /dev/null
+++ b/tools/mm/page_owner_sort.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User-space helper to sort the output of /sys/kernel/debug/page_owner
+ *
+ * Example use:
+ * cat /sys/kernel/debug/page_owner > page_owner_full.txt
+ * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+ * Or sort by total memory:
+ * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
+ *
+ * See Documentation/mm/page_owner.rst
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <regex.h>
+#include <errno.h>
+#include <linux/types.h>
+#include <getopt.h>
+
+#define TASK_COMM_LEN 16
+
+/* One parsed page_owner record; after culling, one merged group of records. */
+struct block_list {
+ char *txt; /* heap copy of the record text, filled in by add_list() */
+ char *comm; // task command name
+ char *stacktrace; /* points into txt, just past its first newline */
+ __u64 ts_nsec; /* value captured by ts_nsec_pattern */
+ int len; /* length of txt in bytes, excluding the terminator */
+ int num; /* how many input records were merged into this entry */
+ int page_num; /* 1 << order for one record; summed when records merge */
+ pid_t pid;
+ pid_t tgid;
+ int allocator; /* mask of ALLOCATOR_BIT values from get_allocator() */
+};
+/* Bits of the global 'filter' mask, set by the --pid/--tgid/--name options. */
+enum FILTER_BIT {
+ FILTER_PID = 1<<1,
+ FILTER_TGID = 1<<2,
+ FILTER_COMM = 1<<3
+};
+/* Bits of the global 'cull' mask, set by parse_cull_args(). */
+enum CULL_BIT {
+ CULL_PID = 1<<1,
+ CULL_TGID = 1<<2,
+ CULL_COMM = 1<<3,
+ CULL_STACKTRACE = 1<<4,
+ CULL_ALLOCATOR = 1<<5
+};
+/* Bits stored in block_list.allocator by get_allocator(). */
+enum ALLOCATOR_BIT {
+ ALLOCATOR_CMA = 1<<1,
+ ALLOCATOR_SLAB = 1<<2,
+ ALLOCATOR_VMALLOC = 1<<3,
+ ALLOCATOR_OTHERS = 1<<4
+};
+/* Key identifiers returned by get_arg_type() for --cull/--sort arguments. */
+enum ARG_TYPE {
+ ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_CULL_TIME,
+ ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_ALLOCATOR
+};
+/* Sign multiplier applied to a comparator result in compare_sort_condition(). */
+enum SORT_ORDER {
+ SORT_ASC = 1,
+ SORT_DESC = -1,
+};
+/* Bits collected from the single-letter sort options in main(). */
+enum COMP_FLAG {
+ COMP_NO_FLAG = 0,
+ COMP_ALLOC = 1<<0,
+ COMP_PAGE_NUM = 1<<1,
+ COMP_PID = 1<<2,
+ COMP_STACK = 1<<3,
+ COMP_NUM = 1<<4,
+ COMP_TGID = 1<<5,
+ COMP_COMM = 1<<6
+};
+/* Values a record must match to be kept; each array has a companion length. */
+struct filter_condition {
+ pid_t *pids; /* from --pid */
+ pid_t *tgids; /* from --tgid */
+ char **comms; /* from --name */
+ int pids_size;
+ int tgids_size;
+ int comms_size;
+};
+/* Ordered sort keys: cmps[i] is applied with multiplier signs[i]. */
+struct sort_condition {
+ int (**cmps)(const void *, const void *);
+ int *signs;
+ int size; /* number of valid entries in cmps[] and signs[] */
+};
+/* Program-wide state: active filters, sort keys, compiled field patterns. */
+static struct filter_condition fc;
+static struct sort_condition sc;
+/* Each pattern is compiled in main() and has exactly one capture group. */
+static regex_t order_pattern;
+static regex_t pid_pattern;
+static regex_t tgid_pattern;
+static regex_t comm_pattern;
+static regex_t ts_nsec_pattern;
+/* Record table: list_size entries in use out of max_size allocated. */
+static struct block_list *list;
+static int list_size;
+static int max_size;
+static int cull; /* mask of CULL_BIT */
+static int filter; /* mask of FILTER_BIT */
+static bool debug_on; /* set by -d; enables parse diagnostics on stderr */
+
+/* Declared early because parse_cull_args() calls it before its definition. */
+static void set_single_cmp(int (*cmp)(const void *, const void *), int sign);
+
+/*
+ * Read one blank-line-terminated record from fin into buf.
+ *
+ * Lines beginning with "PFN" are not kept in buf; the most recent one is
+ * copied to ext_buf instead.  Returns the number of bytes accumulated in
+ * buf when the terminating empty line is seen, or -1 on end of file or
+ * when buf has no room left.
+ */
+int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin)
+{
+	int used = 0;
+
+	for (;;) {
+		char *line = buf + used;
+		int room = buf_size - used;
+
+		if (room <= 1 || !fgets(line, room, fin))
+			break;
+		if (line[0] == '\n')	/* empty line ends the record */
+			return used;
+		if (strncmp(line, "PFN", 3) == 0) {
+			/* side-channel line: the next read overwrites it in buf */
+			strcpy(ext_buf, line);
+			continue;
+		}
+		used += strlen(line);
+	}
+
+	return -1; /* EOF or no space left in buf. */
+}
+
+/* qsort comparator: order records by their full text. */
+static int compare_txt(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+
+	return strcmp(lhs->txt, rhs->txt);
+}
+
+/* qsort comparator: order records by their stacktrace text. */
+static int compare_stacktrace(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+
+	return strcmp(lhs->stacktrace, rhs->stacktrace);
+}
+
+/* qsort comparator: order records by how many times they were seen. */
+static int compare_num(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+	int diff = lhs->num - rhs->num;
+
+	return diff;
+}
+
+/* qsort comparator: order records by accumulated page count. */
+static int compare_page_num(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+	int diff = lhs->page_num - rhs->page_num;
+
+	return diff;
+}
+
+/* qsort comparator: order records by pid. */
+static int compare_pid(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+
+	return lhs->pid - rhs->pid;
+}
+
+/* qsort comparator: order records by thread group id. */
+static int compare_tgid(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+
+	return lhs->tgid - rhs->tgid;
+}
+
+/* qsort comparator: order records by their allocator bit mask. */
+static int compare_allocator(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+	int diff = lhs->allocator - rhs->allocator;
+
+	return diff;
+}
+
+/* qsort comparator: order records by task command name. */
+static int compare_comm(const void *p1, const void *p2)
+{
+	const struct block_list *lhs = p1;
+	const struct block_list *rhs = p2;
+
+	return strcmp(lhs->comm, rhs->comm);
+}
+
+/*
+ * qsort comparator: order records by allocation timestamp.
+ *
+ * Returns a negative, zero or positive value.  Equal timestamps must
+ * compare as 0: qsort() requires a consistent ordering, and
+ * compare_sort_condition() only moves on to the next sort key when the
+ * current one reports a tie.
+ */
+static int compare_ts(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	if (l1->ts_nsec < l2->ts_nsec)
+		return -1;
+	return l1->ts_nsec > l2->ts_nsec;
+}
+
+/*
+ * Comparator used to group records before culling.
+ *
+ * With no cull rule set, records are ordered by full text.  Otherwise each
+ * enabled cull key is compared in a fixed order (stacktrace, pid, tgid,
+ * comm, allocator) and the first non-zero result is returned; 0 means the
+ * two records belong to the same group.
+ */
+static int compare_cull_condition(const void *p1, const void *p2)
+{
+	static const struct {
+		int bit;
+		int (*cmp)(const void *, const void *);
+	} keys[] = {
+		{ CULL_STACKTRACE, compare_stacktrace },
+		{ CULL_PID, compare_pid },
+		{ CULL_TGID, compare_tgid },
+		{ CULL_COMM, compare_comm },
+		{ CULL_ALLOCATOR, compare_allocator },
+	};
+
+	if (!cull)
+		return compare_txt(p1, p2);
+
+	for (size_t k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+		int diff;
+
+		if (!(cull & keys[k].bit))
+			continue;
+		diff = keys[k].cmp(p1, p2);
+		if (diff)
+			return diff;
+	}
+	return 0;
+}
+
+/*
+ * Comparator for the final output order: apply the configured sort keys in
+ * sequence, each multiplied by its sign, and return the first non-zero
+ * result (0 if every key ties).
+ */
+static int compare_sort_condition(const void *p1, const void *p2)
+{
+	for (int key = 0; key < sc.size; ++key) {
+		int verdict = sc.signs[key] * sc.cmps[key](p1, p2);
+
+		if (verdict != 0)
+			return verdict;
+	}
+	return 0;
+}
+
+/*
+ * Delete the text matched by the first capture group of pattern from buf.
+ *
+ * buf holds len bytes and must be NUL-terminated for regexec().  The bytes
+ * after the match are shifted down over it.  Returns the new length, or
+ * len unchanged when the pattern (or its group) does not match.  No
+ * terminator is written at the new end; the caller does that.
+ */
+static int remove_pattern(regex_t *pattern, char *buf, int len)
+{
+	regmatch_t pmatch[2];
+	int err;
+
+	err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
+	if (err != 0 || pmatch[1].rm_so == -1)
+		return len;
+
+	/*
+	 * Source and destination overlap whenever the tail is longer than
+	 * the removed span, so memmove() is required here, not memcpy().
+	 */
+	memmove(buf + pmatch[1].rm_so,
+		buf + pmatch[1].rm_eo, len - pmatch[1].rm_eo);
+
+	return len - (pmatch[1].rm_eo - pmatch[1].rm_so);
+}
+
+/*
+ * Copy the text matched by the first capture group of pattern in buf into
+ * pattern_str.
+ *
+ * Only the matched bytes are copied: no terminator is appended and no
+ * length limit is applied, so the caller must pass a zero-filled buffer
+ * large enough for the match.  Returns 0 on a match, -1 otherwise (with a
+ * diagnostic on stderr when debugging is enabled).
+ */
+static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
+{
+	regmatch_t groups[2];
+
+	if (regexec(pattern, buf, 2, groups, REG_NOTBOL) == 0 &&
+	    groups[1].rm_so != -1) {
+		int span = groups[1].rm_eo - groups[1].rm_so;
+
+		memcpy(pattern_str, buf + groups[1].rm_so, span);
+		return 0;
+	}
+
+	if (debug_on)
+		fprintf(stderr, "no matching pattern in %s\n", buf);
+	return -1;
+}
+
+/*
+ * Compile regex into pattern as an extended, newline-sensitive expression.
+ *
+ * Returns true only when compilation succeeds and the expression has
+ * exactly one capture group; otherwise reports the problem on stderr and
+ * returns false.
+ */
+static bool check_regcomp(regex_t *pattern, const char *regex)
+{
+	int rc = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
+
+	if (rc == 0 && pattern->re_nsub == 1)
+		return true;
+
+	fprintf(stderr, "Invalid pattern %s code %d\n", regex, rc);
+	return false;
+}
+
+/*
+ * Split str at every occurrence of sep.
+ *
+ * Returns a newly allocated array of newly allocated, NUL-terminated
+ * pieces and stores the piece count in *size.  Empty pieces are kept, so
+ * an input with k separators always yields k + 1 entries.  Release the
+ * result with free_explode().
+ */
+static char **explode(char sep, const char *str, int *size)
+{
+	const char *const end = str + strlen(str);
+	const char *piece = str;
+	const char *p;
+	int pieces = 1, filled = 0;
+
+	for (p = str; p < end; p++)
+		pieces += (*p == sep);
+	char **out = calloc(pieces, sizeof(char *));
+
+	/* p == end closes the final piece, just like a separator would. */
+	for (p = str; p <= end; p++) {
+		if (p < end && *p != sep)
+			continue;
+
+		size_t n = p - piece;
+
+		out[filled] = calloc(n + 1, sizeof(char));
+		memcpy(out[filled++], piece, n);
+		piece = p + 1;
+	}
+	*size = filled;
+	return out;
+}
+
+/* Release an array produced by explode(): every piece, then the array. */
+static void free_explode(char **arr, int size)
+{
+	int idx = 0;
+
+	while (idx < size)
+		free(arr[idx++]);
+	free(arr);
+}
+
+# define FIELD_BUFF 25
+
+/*
+ * Extract the allocation order from a record and return the page count
+ * (1 << order).
+ *
+ * Returns 0 when the order is missing, malformed, or too large for the
+ * shift to be representable in an int.  Shifting an int by its width or
+ * more is undefined behaviour, so the upper bound is derived from the
+ * width of int rather than an arbitrary cap.
+ */
+static int get_page_num(char *buf)
+{
+	long order_val;
+	char order_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&order_pattern, order_str, buf);
+	errno = 0;
+	order_val = strtol(order_str, &endptr, 10);
+	if (errno != 0 || endptr == order_str || *endptr != '\0' ||
+	    order_val < 0 || order_val >= (long)(8 * sizeof(int) - 1)) {
+		if (debug_on)
+			fprintf(stderr, "wrong order in follow buf:\n%s\n", buf);
+		return 0;
+	}
+
+	return 1 << order_val;
+}
+
+/*
+ * Extract the pid field from a record.  Returns -1 when the field is
+ * missing or is not a clean decimal number.
+ */
+static pid_t get_pid(char *buf)
+{
+	char digits[FIELD_BUFF] = {0};
+	char *stop;
+	pid_t value;
+
+	search_pattern(&pid_pattern, digits, buf);
+	errno = 0;
+	value = strtol(digits, &stop, 10);
+	if (errno == 0 && stop != digits && *stop == '\0')
+		return value;
+
+	if (debug_on)
+		fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf);
+	return -1;
+}
+
+/*
+ * Extract the tgid field from a record.  Returns -1 when the field is
+ * missing or is not a clean decimal number.
+ */
+static pid_t get_tgid(char *buf)
+{
+	char digits[FIELD_BUFF] = {0};
+	char *stop;
+	pid_t value;
+
+	search_pattern(&tgid_pattern, digits, buf);
+	errno = 0;
+	value = strtol(digits, &stop, 10);
+	if (errno == 0 && stop != digits && *stop == '\0')
+		return value;
+
+	if (debug_on)
+		fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf);
+	return -1;
+}
+
+/*
+ * Extract the allocation timestamp (nanoseconds) from a record.  On a
+ * missing or malformed field the value -1 converted to __u64 is returned.
+ */
+static __u64 get_ts_nsec(char *buf)
+{
+	char digits[FIELD_BUFF] = {0};
+	char *stop;
+	__u64 value;
+
+	search_pattern(&ts_nsec_pattern, digits, buf);
+	errno = 0;
+	value = strtoull(digits, &stop, 10);
+	if (errno == 0 && stop != digits && *stop == '\0')
+		return value;
+
+	if (debug_on)
+		fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf);
+	return -1;
+}
+
+/*
+ * Extract the task command name from a record.
+ *
+ * Returns a newly allocated, zero-filled TASK_COMM_LEN buffer holding the
+ * captured name (an empty string when the record has no match), or NULL
+ * if the allocation fails.  The caller owns the buffer and must free() it.
+ */
+static char *get_comm(char *buf)
+{
+	/* calloc() gives the zero fill that terminates the copied name. */
+	char *comm_str = calloc(TASK_COMM_LEN, sizeof(char));
+
+	if (!comm_str) {
+		fprintf(stderr, "Out of memory\n");
+		return NULL;
+	}
+
+	/* A miss is already reported by search_pattern() in debug mode. */
+	search_pattern(&comm_pattern, comm_str, buf);
+
+	return comm_str;
+}
+
+/*
+ * Map a --cull/--sort key, given by full name or abbreviation, to its
+ * ARG_TYPE value.  Unrecognised keys yield ARG_UNKNOWN.
+ */
+static int get_arg_type(const char *arg)
+{
+	static const struct {
+		const char *name;
+		const char *abbrev;
+		int type;
+	} keys[] = {
+		{ "pid", "p", ARG_PID },
+		{ "tgid", "tg", ARG_TGID },
+		{ "name", "n", ARG_COMM },
+		{ "stacktrace", "st", ARG_STACKTRACE },
+		{ "txt", "T", ARG_TXT },
+		{ "alloc_ts", "at", ARG_ALLOC_TS },
+		{ "allocator", "ator", ARG_ALLOCATOR },
+	};
+
+	for (size_t k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+		if (strcmp(arg, keys[k].name) == 0 ||
+		    strcmp(arg, keys[k].abbrev) == 0)
+			return keys[k].type;
+	}
+	return ARG_UNKNOWN;
+}
+
+/*
+ * Classify which allocator produced a record.
+ *
+ * migrate_info is the record's "PFN" line: "CMA" or "slab" appearing in it
+ * set the matching bits.  The record is tagged VMALLOC when its text has a
+ * "__vmalloc_node_range" frame and "alloc_pages" occurs between the start
+ * of the preceding line and that frame.  If nothing matches, the result is
+ * ALLOCATOR_OTHERS.
+ *
+ * The backward scans stop at the start of buf, so a frame found on the
+ * first or second line of a malformed record cannot run the pointer out
+ * of bounds.
+ */
+static int get_allocator(const char *buf, const char *migrate_info)
+{
+	const char *frame, *line, *prev_line, *hit;
+	int allocator = 0;
+
+	if (strstr(migrate_info, "CMA"))
+		allocator |= ALLOCATOR_CMA;
+	if (strstr(migrate_info, "slab"))
+		allocator |= ALLOCATOR_SLAB;
+
+	frame = strstr(buf, "__vmalloc_node_range");
+	if (frame) {
+		/* Start of the line holding the vmalloc frame. */
+		line = frame;
+		while (line > buf && line[-1] != '\n')
+			line--;
+
+		/* A preceding line exists only if we stopped on a newline. */
+		if (line > buf) {
+			prev_line = line - 1;
+			while (prev_line > buf && prev_line[-1] != '\n')
+				prev_line--;
+
+			hit = strstr(prev_line, "alloc_pages");
+			if (hit && hit < frame)
+				allocator |= ALLOCATOR_VMALLOC;
+		}
+	}
+	if (allocator == 0)
+		allocator = ALLOCATOR_OTHERS;
+	return allocator;
+}
+
+/* Return true when num equals one of the first list_size entries of list. */
+static bool match_num_list(int num, int *list, int list_size)
+{
+	int idx = 0;
+
+	while (idx < list_size) {
+		if (num == list[idx])
+			return true;
+		idx++;
+	}
+	return false;
+}
+
+/* Return true when str equals one of the first list_size strings of list. */
+static bool match_str_list(const char *str, char **list, int list_size)
+{
+	int idx = 0;
+
+	while (idx < list_size) {
+		if (strcmp(list[idx], str) == 0)
+			return true;
+		idx++;
+	}
+	return false;
+}
+
+/*
+ * Decide whether a record passes the active --pid/--tgid/--name filters.
+ * A filter that is not enabled never rejects a record.
+ */
+static bool is_need(char *buf)
+{
+	bool wanted;
+	char *comm;
+
+	if ((filter & FILTER_PID) &&
+	    !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
+		return false;
+	if ((filter & FILTER_TGID) &&
+	    !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size))
+		return false;
+
+	/* The name is extracted even when no --name filter is active. */
+	comm = get_comm(buf);
+	wanted = !(filter & FILTER_COMM) ||
+		 match_str_list(comm, fc.comms, fc.comms_size);
+	free(comm);
+	return wanted;
+}
+
+/*
+ * Parse one record (buf, len bytes) and append it to the global list.
+ * ext_buf is the record's "PFN" line, used only for allocator detection.
+ *
+ * Returns false when the table is full or memory runs out, true otherwise
+ * (including when the record is dropped by the active filters).
+ */
+static bool add_list(char *buf, int len, char *ext_buf)
+{
+ if (list_size == max_size) {
+ fprintf(stderr, "max_size too small??\n");
+ return false;
+ }
+ /* Filtered-out records are skipped but are not an error. */
+ if (!is_need(buf))
+ return true;
+ list[list_size].pid = get_pid(buf);
+ list[list_size].tgid = get_tgid(buf);
+ list[list_size].comm = get_comm(buf);
+ list[list_size].txt = malloc(len+1);
+ if (!list[list_size].txt) {
+ fprintf(stderr, "Out of memory\n");
+ return false;
+ }
+ memcpy(list[list_size].txt, buf, len);
+ /* Unless timestamp is the primary sort key, strip it from the stored text. */
+ if (sc.cmps[0] != compare_ts) {
+ len = remove_pattern(&ts_nsec_pattern, list[list_size].txt, len);
+ }
+ /* Terminate only after the optional removal, which may shorten the text. */
+ list[list_size].txt[len] = 0;
+ list[list_size].len = len;
+ list[list_size].num = 1;
+ list[list_size].page_num = get_page_num(buf);
+
+ /* The stacktrace is everything after the first line of the stored text. */
+ list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: "";
+ if (*list[list_size].stacktrace == '\n')
+ list[list_size].stacktrace++;
+ /* Parsed from the original buf, which still holds the timestamp. */
+ list[list_size].ts_nsec = get_ts_nsec(buf);
+ list[list_size].allocator = get_allocator(buf, ext_buf);
+ list_size++;
+ /* Progress indicator, rewritten in place every 1000 records. */
+ if (list_size % 1000 == 0) {
+ printf("loaded %d\r", list_size);
+ fflush(stdout);
+ }
+ return true;
+}
+
+/*
+ * Parse the comma-separated --cull argument and set the matching bits in
+ * the global cull mask.
+ *
+ * Returns false at the first unknown key; bits for the keys seen before it
+ * stay set.  On success, if no sort condition exists yet, sorting defaults
+ * to descending record count.
+ */
+static bool parse_cull_args(const char *arg_str)
+{
+	int size = 0;
+	bool ok = true;
+	char **args = explode(',', arg_str, &size);
+
+	for (int i = 0; ok && i < size; ++i) {
+		switch (get_arg_type(args[i])) {
+		case ARG_PID:
+			cull |= CULL_PID;
+			break;
+		case ARG_TGID:
+			cull |= CULL_TGID;
+			break;
+		case ARG_COMM:
+			cull |= CULL_COMM;
+			break;
+		case ARG_STACKTRACE:
+			cull |= CULL_STACKTRACE;
+			break;
+		case ARG_ALLOCATOR:
+			cull |= CULL_ALLOCATOR;
+			break;
+		default:
+			ok = false;
+			break;
+		}
+	}
+	free_explode(args, size);
+	if (!ok)
+		return false;
+
+	if (sc.size == 0)
+		set_single_cmp(compare_num, SORT_DESC);
+	return true;
+}
+
+/*
+ * Install cmp, applied with the given sign, as the only sort key.
+ *
+ * The one-element arrays are allocated on first use and reused when a sort
+ * condition already exists.  Each allocation is sized from the array's own
+ * element type; sc.cmps holds function pointers, which need not have the
+ * size of an int pointer.  The program exits on allocation failure.
+ */
+static void set_single_cmp(int (*cmp)(const void *, const void *), int sign)
+{
+	if (sc.signs == NULL || sc.size < 1)
+		sc.signs = calloc(1, sizeof(*sc.signs));
+	if (sc.cmps == NULL || sc.size < 1)
+		sc.cmps = calloc(1, sizeof(*sc.cmps));
+	if (sc.signs == NULL || sc.cmps == NULL) {
+		fprintf(stderr, "Out of memory\n");
+		exit(1);
+	}
+	sc.signs[0] = sign;
+	sc.cmps[0] = cmp;
+	sc.size = 1;
+}
+
+/*
+ * Parse the --sort argument, "[+|-]key[,[+|-]key[,...]]", into the global
+ * sort condition, replacing any previous one.
+ *
+ * A leading '-' selects descending order; '+' or no prefix selects
+ * ascending.  Returns false on an unknown key or allocation failure, in
+ * which case the sort condition is left empty with nothing allocated.
+ */
+static bool parse_sort_args(const char *arg_str)
+{
+	int size = 0;
+
+	/* Drop any previous condition; free(NULL) is harmless. */
+	free(sc.signs);
+	free(sc.cmps);
+	sc.signs = NULL;
+	sc.cmps = NULL;
+	sc.size = 0;
+
+	char **args = explode(',', arg_str, &size);
+
+	/* Size each array from its own element type (cmps holds function pointers). */
+	sc.signs = calloc(size, sizeof(*sc.signs));
+	sc.cmps = calloc(size, sizeof(*sc.cmps));
+	if (!sc.signs || !sc.cmps)
+		goto fail;
+
+	for (int i = 0; i < size; ++i) {
+		int offset = 0;
+
+		sc.signs[i] = SORT_ASC;
+		if (args[i][0] == '-' || args[i][0] == '+') {
+			if (args[i][0] == '-')
+				sc.signs[i] = SORT_DESC;
+			offset = 1;
+		}
+
+		int arg_type = get_arg_type(args[i]+offset);
+
+		if (arg_type == ARG_PID)
+			sc.cmps[i] = compare_pid;
+		else if (arg_type == ARG_TGID)
+			sc.cmps[i] = compare_tgid;
+		else if (arg_type == ARG_COMM)
+			sc.cmps[i] = compare_comm;
+		else if (arg_type == ARG_STACKTRACE)
+			sc.cmps[i] = compare_stacktrace;
+		else if (arg_type == ARG_ALLOC_TS)
+			sc.cmps[i] = compare_ts;
+		else if (arg_type == ARG_TXT)
+			sc.cmps[i] = compare_txt;
+		else if (arg_type == ARG_ALLOCATOR)
+			sc.cmps[i] = compare_allocator;
+		else
+			goto fail;
+	}
+	sc.size = size;
+	free_explode(args, size);
+	return true;
+
+fail:
+	free_explode(args, size);
+	free(sc.signs);
+	free(sc.cmps);
+	sc.signs = NULL;
+	sc.cmps = NULL;
+	sc.size = 0;
+	return false;
+}
+
+/*
+ * Parse a comma-separated list of decimal numbers.
+ *
+ * Returns a newly allocated array and stores its length in *list_size.
+ * Returns NULL, leaving *list_size untouched, if any element is not a
+ * clean decimal number or memory runs out.  The temporary pieces from
+ * explode() are released on every path.
+ */
+static int *parse_nums_list(char *arg_str, int *list_size)
+{
+	int size = 0;
+	char **args = explode(',', arg_str, &size);
+	int *list = calloc(size, sizeof(*list));
+
+	if (!list)
+		goto fail;
+
+	errno = 0;
+	for (int i = 0; i < size; ++i) {
+		char *endptr = NULL;
+
+		list[i] = strtol(args[i], &endptr, 10);
+		if (errno != 0 || endptr == args[i] || *endptr != '\0')
+			goto fail;
+	}
+	*list_size = size;
+	free_explode(args, size);
+	return list;
+
+fail:
+	free(list);
+	free_explode(args, size);
+	return NULL;
+}
+
+/*
+ * Write "allocated by " followed by a space-terminated label for every
+ * allocator bit set in the mask.
+ */
+static void print_allocator(FILE *out, int allocator)
+{
+	static const struct {
+		int bit;
+		const char *label;
+	} kinds[] = {
+		{ ALLOCATOR_CMA, "CMA " },
+		{ ALLOCATOR_SLAB, "SLAB " },
+		{ ALLOCATOR_VMALLOC, "VMALLOC " },
+		{ ALLOCATOR_OTHERS, "OTHERS " },
+	};
+
+	fputs("allocated by ", out);
+	for (size_t k = 0; k < sizeof(kinds) / sizeof(kinds[0]); k++) {
+		if (allocator & kinds[k].bit)
+			fputs(kinds[k].label, out);
+	}
+}
+
+#define BUF_SIZE (128 * 1024)
+
+/*
+ * Print the command-line help to stdout.
+ *
+ * The key names listed for --cull are the ones get_arg_type() accepts
+ * (the task-name key is "name", not "comm"), and the -d switch handled by
+ * main() is listed with the other short options.
+ */
+static void usage(void)
+{
+	printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
+		"-a\t\t\tSort by memory allocation time.\n"
+		"-d\t\t\tPrint debug information.\n"
+		"-m\t\t\tSort by total memory.\n"
+		"-n\t\t\tSort by task command name.\n"
+		"-p\t\t\tSort by pid.\n"
+		"-P\t\t\tSort by tgid.\n"
+		"-s\t\t\tSort by the stacktrace.\n"
+		"-t\t\t\tSort by number of times record is seen (default).\n\n"
+		"--pid <pidlist>\t\tSelect by pid. This selects the information"
+		" of\n\t\t\tblocks whose process ID numbers appear in <pidlist>.\n"
+		"--tgid <tgidlist>\tSelect by tgid. This selects the information"
+		" of\n\t\t\tblocks whose Thread Group ID numbers appear in "
+		"<tgidlist>.\n"
+		"--name <cmdlist>\tSelect by command name. This selects the"
+		" information\n\t\t\tof blocks whose command name appears in"
+		" <cmdlist>.\n"
+		"--cull <rules>\t\tCull by user-defined rules. <rules> is a "
+		"single\n\t\t\targument in the form of a comma-separated list "
+		"with some\n\t\t\tcommon fields predefined (pid, tgid, name, "
+		"stacktrace, allocator)\n"
+		"--sort <order>\t\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
+	);
+}
+
+/*
+ * Entry point: parse options, load every record from <input>, group equal
+ * records according to the cull rules, sort the groups, and write them to
+ * <output>.
+ *
+ * NOTE(review): every path that reaches the cleanup labels returns 0, so
+ * out-of-memory and pattern-compile failures are not reflected in the exit
+ * status; fin and fout are never closed explicitly.
+ */
+int main(int argc, char **argv)
+{
+ FILE *fin, *fout;
+ char *buf, *ext_buf;
+ int i, count, compare_flag;
+ struct stat st;
+ int opt;
+ /* Long options are reported to the switch below as the small integers 1..5. */
+ struct option longopts[] = {
+ { "pid", required_argument, NULL, 1 },
+ { "tgid", required_argument, NULL, 2 },
+ { "name", required_argument, NULL, 3 },
+ { "cull", required_argument, NULL, 4 },
+ { "sort", required_argument, NULL, 5 },
+ { "help", no_argument, NULL, 'h' },
+ { 0, 0, 0, 0},
+ };
+
+ compare_flag = COMP_NO_FLAG;
+
+ while ((opt = getopt_long(argc, argv, "admnpstPh", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'a':
+ compare_flag |= COMP_ALLOC;
+ break;
+ case 'd':
+ debug_on = true;
+ break;
+ case 'm':
+ compare_flag |= COMP_PAGE_NUM;
+ break;
+ case 'p':
+ compare_flag |= COMP_PID;
+ break;
+ case 's':
+ compare_flag |= COMP_STACK;
+ break;
+ case 't':
+ compare_flag |= COMP_NUM;
+ break;
+ case 'P':
+ compare_flag |= COMP_TGID;
+ break;
+ case 'n':
+ compare_flag |= COMP_COMM;
+ break;
+ case 'h':
+ usage();
+ exit(0);
+ case 1:
+ filter = filter | FILTER_PID;
+ fc.pids = parse_nums_list(optarg, &fc.pids_size);
+ if (fc.pids == NULL) {
+ fprintf(stderr, "wrong/invalid pid in from the command line:%s\n",
+ optarg);
+ exit(1);
+ }
+ break;
+ case 2:
+ filter = filter | FILTER_TGID;
+ fc.tgids = parse_nums_list(optarg, &fc.tgids_size);
+ if (fc.tgids == NULL) {
+ fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n",
+ optarg);
+ exit(1);
+ }
+ break;
+ case 3:
+ filter = filter | FILTER_COMM;
+ fc.comms = explode(',', optarg, &fc.comms_size);
+ break;
+ case 4:
+ if (!parse_cull_args(optarg)) {
+ fprintf(stderr, "wrong argument after --cull option:%s\n",
+ optarg);
+ exit(1);
+ }
+ break;
+ case 5:
+ if (!parse_sort_args(optarg)) {
+ fprintf(stderr, "wrong argument after --sort option:%s\n",
+ optarg);
+ exit(1);
+ }
+ break;
+ default:
+ usage();
+ exit(1);
+ }
+
+ /* Two positional arguments (input and output file) must remain. */
+ if (optind >= (argc - 1)) {
+ usage();
+ exit(1);
+ }
+
+ /* Only one compare option is allowed, yet we also want to handle the
+ * default case where no option is provided, in which case we still
+ * want to match the behavior of the -t option (compare by number of
+ * times a record is seen).
+ */
+ switch (compare_flag) {
+ case COMP_ALLOC:
+ set_single_cmp(compare_ts, SORT_ASC);
+ break;
+ case COMP_PAGE_NUM:
+ set_single_cmp(compare_page_num, SORT_DESC);
+ break;
+ case COMP_PID:
+ set_single_cmp(compare_pid, SORT_ASC);
+ break;
+ case COMP_STACK:
+ set_single_cmp(compare_stacktrace, SORT_ASC);
+ break;
+ case COMP_NO_FLAG:
+ case COMP_NUM:
+ set_single_cmp(compare_num, SORT_DESC);
+ break;
+ case COMP_TGID:
+ set_single_cmp(compare_tgid, SORT_ASC);
+ break;
+ case COMP_COMM:
+ set_single_cmp(compare_comm, SORT_ASC);
+ break;
+ default:
+ usage();
+ exit(1);
+ }
+
+ fin = fopen(argv[optind], "r");
+ fout = fopen(argv[optind + 1], "w");
+ if (!fin || !fout) {
+ usage();
+ perror("open: ");
+ exit(1);
+ }
+
+ /* Each pattern captures exactly one field of the record header line. */
+ if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
+ goto out_order;
+ if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
+ goto out_pid;
+ if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
+ goto out_tgid;
+ if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
+ goto out_comm;
+ if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns"))
+ goto out_ts;
+
+ /* Table capacity is estimated as one record per 100 bytes of input. */
+ fstat(fileno(fin), &st);
+ max_size = st.st_size / 100; /* hack ... */
+
+ list = malloc(max_size * sizeof(*list));
+ buf = malloc(BUF_SIZE);
+ ext_buf = malloc(BUF_SIZE);
+ if (!list || !buf || !ext_buf) {
+ fprintf(stderr, "Out of memory\n");
+ goto out_free;
+ }
+
+ /* Load phase: one read_block() call per record until EOF. */
+ for ( ; ; ) {
+ int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin);
+
+ if (buf_len < 0)
+ break;
+ if (!add_list(buf, buf_len, ext_buf))
+ goto out_free;
+ }
+
+ printf("loaded %d\n", list_size);
+
+ printf("sorting ....\n");
+
+ /* First sort brings records of the same cull group next to each other. */
+ qsort(list, list_size, sizeof(list[0]), compare_cull_condition);
+
+ printf("culling\n");
+
+ /* Compact in place: merge each record into the previous group if equal. */
+ for (i = count = 0; i < list_size; i++) {
+ if (count == 0 ||
+ compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) {
+ list[count++] = list[i];
+ } else {
+ list[count-1].num += list[i].num;
+ list[count-1].page_num += list[i].page_num;
+ }
+ }
+
+ /* Second sort puts the merged groups into the requested output order. */
+ qsort(list, count, sizeof(list[0]), compare_sort_condition);
+
+ for (i = 0; i < count; i++) {
+ if (cull == 0) {
+ fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num);
+ print_allocator(fout, list[i].allocator);
+ fprintf(fout, ":\n%s\n", list[i].txt);
+ }
+ else {
+ /* With culling, print only the fields that define or filter the group. */
+ fprintf(fout, "%d times, %d pages",
+ list[i].num, list[i].page_num);
+ if (cull & CULL_PID || filter & FILTER_PID)
+ fprintf(fout, ", PID %d", list[i].pid);
+ if (cull & CULL_TGID || filter & FILTER_TGID)
+ fprintf(fout, ", TGID %d", list[i].tgid);
+ if (cull & CULL_COMM || filter & FILTER_COMM)
+ fprintf(fout, ", task_comm_name: %s", list[i].comm);
+ if (cull & CULL_ALLOCATOR) {
+ fprintf(fout, ", ");
+ print_allocator(fout, list[i].allocator);
+ }
+ if (cull & CULL_STACKTRACE)
+ fprintf(fout, ":\n%s", list[i].stacktrace);
+ fprintf(fout, "\n");
+ }
+ }
+
+/* Cleanup labels unwind in reverse order of acquisition. */
+out_free:
+ if (ext_buf)
+ free(ext_buf);
+ if (buf)
+ free(buf);
+ if (list)
+ free(list);
+out_ts:
+ regfree(&ts_nsec_pattern);
+out_comm:
+ regfree(&comm_pattern);
+out_tgid:
+ regfree(&tgid_pattern);
+out_pid:
+ regfree(&pid_pattern);
+out_order:
+ regfree(&order_pattern);
+
+ return 0;
+}
diff --git a/tools/mm/show_page_info.py b/tools/mm/show_page_info.py
new file mode 100644
index 000000000000..c46d8ea283d7
--- /dev/null
+++ b/tools/mm/show_page_info.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env drgn
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2025 Ye Liu <liuye@kylinos.cn>
+
+import argparse
+import sys
+from drgn import Object, FaultError, PlatformFlags, cast
+from drgn.helpers.linux import find_task, follow_page, page_size
+from drgn.helpers.linux.mm import (
+ decode_page_flags, page_to_pfn, page_to_phys, page_to_virt, vma_find,
+ PageSlab, PageCompound, PageHead, PageTail, compound_head, compound_order, compound_nr
+)
+from drgn.helpers.linux.cgroup import cgroup_name, cgroup_path
+
+DESC = """
+This is a drgn script to show the page state.
+For more info on drgn, visit https://github.com/osandov/drgn.
+"""
+
+def format_page_data(page):
+ """
+ Format raw page data into a readable hex dump with "RAW:" prefix.
+
+ :param page: drgn.Object instance representing the page.
+ :return: Formatted string of memory contents.
+ """
+ try:
+ address = page.value_()
+ size = prog.type("struct page").size
+
+ # Dump in words of 8 bytes on 64-bit targets, 4 bytes otherwise.
+ if prog.platform.flags & PlatformFlags.IS_64_BIT:
+ word_size = 8
+ else:
+ word_size = 4
+ num_words = size // word_size
+
+ values = []
+ for i in range(num_words):
+ word_address = address + i * word_size
+ word = prog.read_word(word_address)
+ # Zero-padded hex, two digits per byte of the word.
+ values.append(f"{word:0{word_size * 2}x}")
+
+ # Four words per output line.
+ lines = [f"RAW: {' '.join(values[i:i + 4])}" for i in range(0, len(values), 4)]
+
+ return "\n".join(lines)
+
+ # Errors are returned as text in place of the dump, not raised.
+ except FaultError as e:
+ return f"Error reading memory: {e}"
+ except Exception as e:
+ return f"Unexpected error: {e}"
+
+def get_memcg_info(page):
+    """Retrieve memory cgroup information for a page.
+
+    :param page: drgn.Object instance representing the page.
+    :return: Tuple ``(name, path)``. ``("none", ...)`` when the page has no
+        memcg; ``("unknown", <error text>)`` when the lookup fails.
+    """
+    try:
+        MEMCG_DATA_OBJEXTS = prog.constant("MEMCG_DATA_OBJEXTS").value_()
+        MEMCG_DATA_KMEM = prog.constant("MEMCG_DATA_KMEM").value_()
+        mask = prog.constant('__NR_MEMCG_DATA_FLAGS').value_() - 1
+        memcg_data = page.memcg_data.read_()
+        # The low flag bits select how the remaining bits are interpreted.
+        # In every branch memcg must stay a drgn pointer object: the code
+        # below calls memcg.value_() and dereferences memcg.css, neither
+        # of which works on a plain int.
+        if memcg_data & MEMCG_DATA_OBJEXTS:
+            slabobj_ext = cast("struct slabobj_ext *", memcg_data & ~mask)
+            memcg = slabobj_ext.objcg.memcg
+        elif memcg_data & MEMCG_DATA_KMEM:
+            objcg = cast("struct obj_cgroup *", memcg_data & ~mask)
+            memcg = objcg.memcg
+        else:
+            memcg = cast("struct mem_cgroup *", memcg_data & ~mask)
+
+        if memcg.value_() == 0:
+            return "none", "/sys/fs/cgroup/memory/"
+        cgrp = memcg.css.cgroup
+        return cgroup_name(cgrp).decode(), f"/sys/fs/cgroup/memory{cgroup_path(cgrp).decode()}"
+    except FaultError as e:
+        return "unknown", f"Error retrieving memcg info: {e}"
+    except Exception as e:
+        return "unknown", f"Unexpected error: {e}"
+
+def show_page_state(page, addr, mm, pid, task):
+ """Display detailed information about a page."""
+ # Output order: header line, raw hex dump, aligned field table, then
+ # slab/compound notes. Errors are printed rather than raised.
+ try:
+ print(f'PID: {pid} Comm: {task.comm.string_().decode()} mm: {hex(mm)}')
+ try:
+ print(format_page_data(page))
+ except FaultError as e:
+ print(f"Error reading page data: {e}")
+ fields = {
+ "Page Address": hex(page.value_()),
+ "Page Flags": decode_page_flags(page),
+ "Page Size": prog["PAGE_SIZE"].value_(),
+ "Page PFN": hex(page_to_pfn(page).value_()),
+ "Page Physical": hex(page_to_phys(page).value_()),
+ "Page Virtual": hex(page_to_virt(page).value_()),
+ "Page Refcount": page._refcount.counter.value_(),
+ "Page Mapcount": page._mapcount.counter.value_(),
+ "Page Index": hex(page.__folio_index.value_()),
+ "Page Memcg Data": hex(page.memcg_data.value_()),
+ }
+
+ memcg_name, memcg_path = get_memcg_info(page)
+ fields["Memcg Name"] = memcg_name
+ fields["Memcg Path"] = memcg_path
+ fields["Page Mapping"] = hex(page.mapping.value_())
+ # Bit 0 of the mapping pointer decides the Anon/File label.
+ fields["Page Anon/File"] = "Anon" if page.mapping.value_() & 0x1 else "File"
+
+ try:
+ vma = vma_find(mm, addr)
+ fields["Page VMA"] = hex(vma.value_())
+ fields["VMA Start"] = hex(vma.vm_start.value_())
+ fields["VMA End"] = hex(vma.vm_end.value_())
+ except FaultError as e:
+ fields["Page VMA"] = "Unavailable"
+ fields["VMA Start"] = "Unavailable"
+ fields["VMA End"] = "Unavailable"
+ print(f"Error retrieving VMA information: {e}")
+
+ # Calculate the maximum field name length for alignment
+ max_field_len = max(len(field) for field in fields)
+
+ # Print aligned fields
+ for field, value in fields.items():
+ print(f"{field}:".ljust(max_field_len + 2) + f"{value}")
+
+ # Additional information about the page
+ if PageSlab(page):
+ print("This page belongs to the slab allocator.")
+
+ if PageCompound(page):
+ print("This page is part of a compound page.")
+ if PageHead(page):
+ print("This page is the head page of a compound page.")
+ if PageTail(page):
+ print("This page is the tail page of a compound page.")
+ print(f"{'Head Page:'.ljust(max_field_len + 2)}{hex(compound_head(page).value_())}")
+ print(f"{'Compound Order:'.ljust(max_field_len + 2)}{compound_order(page).value_()}")
+ print(f"{'Number of Pages:'.ljust(max_field_len + 2)}{compound_nr(page).value_()}")
+ else:
+ print("This page is not part of a compound page.")
+ except FaultError as e:
+ print(f"Error accessing page state: {e}")
+ except Exception as e:
+ print(f"Unexpected error: {e}")
+
+def main():
+ """Main function to parse arguments and display page state."""
+ parser = argparse.ArgumentParser(description=DESC, formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument('pid', metavar='PID', type=int, help='Target process ID (PID)')
+ parser.add_argument('vaddr', metavar='VADDR', type=str, help='Target virtual address in hexadecimal format (e.g., 0x7fff1234abcd)')
+ args = parser.parse_args()
+
+ # The address is always parsed as base 16, with or without a 0x prefix.
+ try:
+ vaddr = int(args.vaddr, 16)
+ except ValueError:
+ sys.exit(f"Error: Invalid virtual address format: {args.vaddr}")
+
+ try:
+ task = find_task(args.pid)
+ mm = task.mm
+ page = follow_page(mm, vaddr)
+
+ if page:
+ show_page_state(page, vaddr, mm, args.pid, task)
+ else:
+ sys.exit(f"Address {hex(vaddr)} is not mapped.")
+ # sys.exit() with a string prints it to stderr and exits with status 1.
+ except FaultError as e:
+ sys.exit(f"Error accessing task or memory: {e}")
+ except Exception as e:
+ sys.exit(f"Unexpected error: {e}")
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/mm/slabinfo-gnuplot.sh b/tools/mm/slabinfo-gnuplot.sh
new file mode 100644
index 000000000000..873a892147e5
--- /dev/null
+++ b/tools/mm/slabinfo-gnuplot.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Sergey Senozhatsky, 2015
+# sergey.senozhatsky.work@gmail.com
+#
+
+
+# This program is intended to plot a `slabinfo -X' stats, collected,
+# for example, using the following command:
+# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
+#
+# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
+# and generate graphs (totals, slabs sorted by size, slabs sorted
+# by loss).
+#
+# Graphs can be [individually] regenerated with different ranges and
+# size (-r %d,%d and -s %d,%d options).
+#
+# To visually compare N `totals' graphs, do
+# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
+#
+
+min_slab_name_size=11
+xmin=0
+xmax=0
+width=1500
+height=700
+mode=preprocess
+
+# Print the command-line synopsis and option summary to stdout.
+usage()
+{
+ echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
+ echo "FILEs must contain 'slabinfo -X' samples"
+ echo "-t - plot totals for FILE(s)"
+ echo "-l - plot slabs stats for FILE(s)"
+ echo "-s %d,%d - set image width and height"
+ echo "-r %d,%d - use data samples from a given range"
+}
+
+# Exit the script with status 1 unless $1 names an existing regular file.
+check_file_exist()
+{
+ if [ ! -f "$1" ]; then
+ echo "File '$1' does not exist"
+ exit 1
+ fi
+}
+
+# Plot one pre-processed per-slab file ($1) as SIZE/LOSS boxes.
+# The image is written to "<basename of $1>.png" in the current directory
+# and its name is echoed when gnuplot succeeds.
+do_slabs_plotting()
+{
+ local file=$1
+ local out_file
+ local range="every ::$xmin"
+ local xtic=""
+ local xtic_rotate="norotate"
+ local lines=2000000
+ local wc_lines
+
+ check_file_exist "$file"
+
+ out_file=`basename "$file"`
+ # An explicit upper bound (-r MIN,MAX) limits the plotted range.
+ if [ $xmax -ne 0 ]; then
+ range="$range::$xmax"
+ lines=$((xmax-xmin))
+ fi
+
+ wc_lines=`cat "$file" | wc -l`
+ if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
+ wc_lines=$lines
+ fi
+
+ if [ "$wc_lines" -lt "$lines" ]; then
+ lines=$wc_lines
+ fi
+
+ # Label each sample with the slab name only when there is enough
+ # horizontal room per sample.
+ # NOTE(review): lines is 0 when -r is given with MIN equal to MAX,
+ # which makes this division fail - confirm intended handling.
+ if [ $((width / lines)) -gt $min_slab_name_size ]; then
+ xtic=":xtic(1)"
+ xtic_rotate=90
+ fi
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set output '$out_file.png'
+set autoscale xy
+set xlabel 'samples'
+set ylabel 'bytes'
+set style histogram columnstacked title textcolor lt -1
+set style fill solid 0.15
+set xtics rotate $xtic_rotate
+set key left above Left title reverse
+
+plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
+ '' $range u 3 title 'LOSS' with boxes
+EOF
+
+ if [ $? -eq 0 ]; then
+ echo "$out_file.png"
+ fi
+}
+
+# Plot "Memory usage" and "Loss" lines for every file in the global
+# t_files array on a single graph. The output name is the concatenation
+# of the input basenames plus ".png"; it is echoed when gnuplot succeeds.
+do_totals_plotting()
+{
+ local gnuplot_cmd=""
+ local range="every ::$xmin"
+ local file=""
+
+ if [ $xmax -ne 0 ]; then
+ range="$range::$xmax"
+ fi
+
+ # Build one pair of plot clauses per input file.
+ for i in "${t_files[@]}"; do
+ check_file_exist "$i"
+
+ file="$file"`basename "$i"`
+ gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\
+ '$i Memory usage' with lines,"
+ gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \
+ '$i Loss' with lines,"
+ done
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set autoscale xy
+set output '$file.png'
+set xlabel 'samples'
+set ylabel 'bytes'
+set key left above Left title reverse
+
+plot $gnuplot_cmd
+EOF
+
+ if [ $? -eq 0 ]; then
+ echo "$file.png"
+ fi
+}
+
+# Split a raw `slabinfo -X' capture ($1) into three data files in the
+# current directory - "-slabs-by-loss", "-slabs-by-size" and "-totals" -
+# and plot each one.
+do_preprocess()
+{
+ local out
+ local lines
+ local in=$1
+
+ check_file_exist "$in"
+
+ # use only 'TOP' slab (biggest memory usage or loss)
+ let lines=3
+ out=`basename "$in"`"-slabs-by-loss"
+ # Keep the lines following the section header, drop separator and
+ # heading rows, and emit: name, column4 + column2*column3, column4.
+ `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
+ grep -E -iv '\-\-|Name|Slabs'\
+ | awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
+ if [ $? -eq 0 ]; then
+ do_slabs_plotting "$out"
+ fi
+
+ let lines=3
+ out=`basename "$in"`"-slabs-by-size"
+ # Same filtering; emits: name, column4, column4 - column2*column3.
+ `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
+ grep -E -iv '\-\-|Name|Slabs'\
+ | awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
+ if [ $? -eq 0 ]; then
+ do_slabs_plotting "$out"
+ fi
+
+ out=`basename "$in"`"-totals"
+ # Fields 3 and 7 of every "Memory used" line become the totals samples.
+ `cat "$in" | grep "Memory used" |\
+ awk '{print $3" "$7}' > "$out"`
+ if [ $? -eq 0 ]; then
+ t_files[0]=$out
+ do_totals_plotting
+ fi
+}
+
+# Parse the option switches and update the mode/width/height/xmin/xmax
+# globals. The index of the first non-option argument (OPTIND) is passed
+# back as the function's return status.
+parse_opts()
+{
+ local opt
+
+ while getopts "tlr::s::h" opt; do
+ case $opt in
+ t)
+ mode=totals
+ ;;
+ l)
+ mode=slabs
+ ;;
+ s)
+ # "W,H" is split on the comma into a two-element array.
+ array=(${OPTARG//,/ })
+ width=${array[0]}
+ height=${array[1]}
+ ;;
+ r)
+ # "MIN,MAX" is split the same way.
+ array=(${OPTARG//,/ })
+ xmin=${array[0]}
+ xmax=${array[1]}
+ ;;
+ h)
+ usage
+ exit 0
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ exit 1
+ ;;
+ :)
+ echo "-$OPTARG requires an argument." >&2
+ exit 1
+ ;;
+ esac
+ done
+
+ return $OPTIND
+}
+
+# Store the positional arguments in the global array that matches the
+# current mode: t_files for totals, files for preprocess and slabs.
+parse_args()
+{
+ local idx=0
+ local p
+
+ for p in "$@"; do
+ case $mode in
+ preprocess)
+ files[$idx]=$p
+ # idx becomes the text "0+1", "0+1+1", ...
+ # NOTE(review): this relies on the array subscript
+ # being evaluated arithmetically - confirm.
+ idx=$idx+1
+ ;;
+ totals)
+ t_files[$idx]=$p
+ idx=$idx+1
+ ;;
+ slabs)
+ files[$idx]=$p
+ idx=$idx+1
+ ;;
+ esac
+ done
+}
+
+# Script entry: parse switches, collect the file arguments, then dispatch
+# on the selected mode.
+parse_opts "$@"
+# parse_opts hands back OPTIND through its return status.
+argstart=$?
+parse_args "${@:$argstart}"
+
+# At least one input file is required in every mode.
+if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
+ usage
+ exit 1
+fi
+
+case $mode in
+ preprocess)
+ for i in "${files[@]}"; do
+ do_preprocess "$i"
+ done
+ ;;
+ totals)
+ do_totals_plotting
+ ;;
+ slabs)
+ for i in "${files[@]}"; do
+ do_slabs_plotting "$i"
+ done
+ ;;
+ *)
+ echo "Unknown mode $mode" >&2
+ usage
+ exit 1
+ ;;
+esac
diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c
new file mode 100644
index 000000000000..80cdbd3db82d
--- /dev/null
+++ b/tools/mm/slabinfo.c
@@ -0,0 +1,1549 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Slabinfo: Tool to get reports about slabs
+ *
+ * (C) 2007 sgi, Christoph Lameter
+ * (C) 2011 Linux Foundation, Christoph Lameter
+ *
+ * Compile with:
+ *
+ * gcc -o slabinfo slabinfo.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <strings.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <getopt.h>
+#include <regex.h>
+#include <errno.h>
+
+#define MAX_SLABS 2000
+#define MAX_ALIASES 500
+#define MAX_NODES 1024
+
+/*
+ * One record per slab cache.  read_slab_dir() fills an entry for every
+ * directory it finds and loads most fields from the attribute file of the
+ * same name inside that directory.
+ */
+struct slabinfo {
+ char *name;
+ int alias;
+ /* Incremented by link_slabs() for every alias resolving to this cache. */
+ int refs;
+ int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
+ unsigned int hwcache_align, object_size, objs_per_slab;
+ unsigned int sanity_checks, slab_size, store_user, trace;
+ int order, poison, reclaim_account, red_zone;
+ unsigned long partial, objects, slabs, objects_partial, objects_total;
+ /* Event counters, each read from the attribute of the same name. */
+ unsigned long alloc_fastpath, alloc_slowpath;
+ unsigned long free_fastpath, free_slowpath;
+ unsigned long free_frozen, free_add_partial, free_remove_partial;
+ unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
+ unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
+ unsigned long deactivate_to_head, deactivate_to_tail;
+ unsigned long deactivate_remote_frees, order_fallback;
+ unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail;
+ unsigned long alloc_node_mismatch, deactivate_bypass;
+ unsigned long cpu_partial_alloc, cpu_partial_free;
+ /* Per-node counts decoded from the "slabs" attribute. */
+ int numa[MAX_NODES];
+ /* Per-node counts decoded from the "partial" attribute. */
+ int numa_partial[MAX_NODES];
+} slabinfo[MAX_SLABS];
+
+/*
+ * One record per symbolic link found by read_slab_dir().
+ */
+struct aliasinfo {
+ /* Name of the symbolic link itself. */
+ char *name;
+ /* Last path component of the link target. */
+ char *ref;
+ /* Cache whose name equals ref; resolved by link_slabs(). */
+ struct slabinfo *slab;
+} aliasinfo[MAX_ALIASES];
+
+/* Number of used entries in slabinfo[]; set by read_slab_dir(). */
+int slabs;
+/* Starts equal to slabs; rename_slabs() decrements it per cache renamed "*". */
+int actual_slabs;
+/* Number of used entries in aliasinfo[]; set by read_slab_dir(). */
+int aliases;
+/* Number of caches whose directory name starts with ':'. */
+int alias_targets;
+/* Highest node number seen so far by decode_numa_list(). */
+int highest_node;
+
+/* Shared scratch buffer filled by the read_*obj() helpers. */
+char buffer[4096];
+
+/*
+ * Option flags.  NOTE(review): presumably set from the command line by code
+ * outside this part of the file -- confirm against the option parser.
+ */
+int show_empty;
+int show_report;
+int show_alias;
+int show_slab;
+int skip_zero = 1;
+int show_numa;
+int show_track;
+int show_first_alias;
+int validate;
+int shrink;
+int show_inverted;
+int show_single_ref;
+int show_totals;
+int sort_size;
+int sort_active;
+int set_debug;
+int show_ops;
+int sort_partial;
+int show_activity;
+int output_lines = -1;
+int sort_loss;
+int extended_totals;
+int show_bytes;
+int unreclaim_only;
+
+/* Debug options */
+int sanity;
+int redzone;
+int poison;
+int tracking;
+int tracing;
+
+/* Base page size in bytes; slab_size() shifts it left by a cache's order. */
+int page_size;
+
+/* Compiled expression that slab_mismatch() matches cache names against. */
+regex_t pattern;
+
+/*
+ * Print a printf-style message on stderr, then terminate the program with
+ * EXIT_FAILURE.  This function does not return.
+ */
+static void fatal(const char *x, ...)
+{
+	va_list args;
+
+	va_start(args, x);
+	vfprintf(stderr, x, args);
+	va_end(args);
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Print the command-line help text on stdout.
+ */
+static void usage(void)
+{
+	/* The text contains no conversion specifications, so it is written as is. */
+	static const char help_text[] =
+		"slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
+		"slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n"
+		"-a|--aliases Show aliases\n"
+		"-A|--activity Most active slabs first\n"
+		"-B|--Bytes Show size in bytes\n"
+		"-D|--display-active Switch line format to activity\n"
+		"-e|--empty Show empty slabs\n"
+		"-f|--first-alias Show first alias\n"
+		"-h|--help Show usage information\n"
+		"-i|--inverted Inverted list\n"
+		"-l|--slabs Show slabs\n"
+		"-L|--Loss Sort by loss\n"
+		"-n|--numa Show NUMA information\n"
+		"-N|--lines=K Show the first K slabs\n"
+		"-o|--ops Show kmem_cache_ops\n"
+		"-P|--partial Sort by number of partial slabs\n"
+		"-r|--report Detailed report on single slabs\n"
+		"-s|--shrink Shrink slabs\n"
+		"-S|--Size Sort by size\n"
+		"-t|--tracking Show alloc/free information\n"
+		"-T|--Totals Show summary information\n"
+		"-U|--Unreclaim Show unreclaimable slabs only\n"
+		"-v|--validate Validate slabs\n"
+		"-X|--Xtotals Show extended summary information\n"
+		"-z|--zero Include empty slabs\n"
+		"-1|--1ref Single reference\n"
+
+		"\n"
+		"-d | --debug Switch off all debug options\n"
+		"-da | --debug=a Switch on all debug options (--debug=FZPU)\n"
+
+		"\n"
+		"-d[afzput] | --debug=[afzput]\n"
+		" f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
+		" z | Z Redzoning\n"
+		" p | P Poisoning\n"
+		" u | U Tracking\n"
+		" t | T Tracing\n"
+
+		"\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n";
+
+	fputs(help_text, stdout);
+}
+
+/*
+ * Read the first line of file @name into the global buffer, dropping one
+ * trailing newline.
+ *
+ * Returns the length of the text now held in buffer; 0 when the file cannot
+ * be opened or yields no data.  An open failure with EACCES is fatal.
+ */
+static unsigned long read_obj(const char *name)
+{
+	FILE *f = fopen(name, "r");
+	size_t len;
+
+	if (!f) {
+		buffer[0] = 0;
+		if (errno == EACCES)
+			fatal("%s, Try using superuser\n", strerror(errno));
+		return 0;
+	}
+
+	if (!fgets(buffer, sizeof(buffer), f))
+		buffer[0] = 0;
+	fclose(f);
+
+	len = strlen(buffer);
+	if (len > 0 && buffer[len - 1] == '\n')
+		buffer[--len] = 0;
+
+	return len;
+}
+
+
+/*
+ * Get the contents of an attribute as a number.
+ *
+ * Reads file @name through read_obj() and converts the leading decimal
+ * digits.  Returns 0 when the file is missing or empty.  strtoul() is used
+ * rather than atol() so that the conversion matches get_obj_and_str() and
+ * stays defined for values that do not fit in a long.
+ */
+static unsigned long get_obj(const char *name)
+{
+	if (!read_obj(name))
+		return 0;
+
+	return strtoul(buffer, NULL, 10);
+}
+
+/*
+ * Read an attribute that consists of a number followed by optional text.
+ *
+ * Returns the leading decimal number of file @name, or 0 when the file is
+ * missing or empty.  *@x receives a strdup()ed copy of whatever follows the
+ * number (leading blanks skipped), or NULL when nothing follows or the copy
+ * could not be allocated.  The caller owns the copy and must free() it.
+ */
+static unsigned long get_obj_and_str(const char *name, char **x)
+{
+	unsigned long result = 0;
+	char *p;
+
+	*x = NULL;
+
+	/* *x is already NULL here; there is nothing else to reset. */
+	if (!read_obj(name))
+		return 0;
+
+	result = strtoul(buffer, &p, 10);
+	while (*p == ' ')
+		p++;
+	if (*p)
+		*x = strdup(p);
+	return result;
+}
+
+/*
+ * Write the decimal value @n to attribute @name of cache @s.
+ *
+ * The path is built relative to the current directory as "<cache>/<name>".
+ * A path that does not fit, or a file that cannot be opened, is fatal.  A
+ * failed write is reported on stderr but does not stop the program, so one
+ * refused attribute does not abort an operation over many caches.
+ */
+static void set_obj(struct slabinfo *s, const char *name, int n)
+{
+	char x[100];
+	FILE *f;
+	int len;
+	int failed;
+	int saved_errno;
+
+	len = snprintf(x, sizeof(x), "%s/%s", s->name, name);
+	if (len < 0 || (size_t)len >= sizeof(x))
+		fatal("Attribute path too long: %s/%s\n", s->name, name);
+
+	f = fopen(x, "w");
+	if (!f)
+		fatal("Cannot write to %s\n", x);
+
+	/*
+	 * The stream is buffered, so the data usually reaches the file only
+	 * when the stream is flushed by fclose(); check both calls.
+	 */
+	failed = fprintf(f, "%d\n", n) < 0;
+	saved_errno = errno;
+	if (fclose(f) == EOF) {
+		failed = 1;
+		saved_errno = errno;
+	}
+	if (failed)
+		fprintf(stderr, "Cannot write %d to %s: %s\n", n, x,
+			strerror(saved_errno));
+}
+
+/*
+ * Read the whole of attribute @name of cache @s into the global buffer.
+ *
+ * The path is built relative to the current directory as "<cache>/<name>".
+ * At most sizeof(buffer) - 1 bytes are read so that the terminating NUL
+ * always fits.  Returns the number of bytes read; 0 (with an empty buffer)
+ * when the path does not fit or the file cannot be opened.
+ */
+static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
+{
+	char x[100];
+	FILE *f;
+	size_t l;
+	int len;
+
+	buffer[0] = 0;
+
+	len = snprintf(x, sizeof(x), "%s/%s", s->name, name);
+	if (len < 0 || (size_t)len >= sizeof(x))
+		return 0;
+
+	f = fopen(x, "r");
+	if (!f)
+		return 0;
+
+	/* Leave room for the terminator written below. */
+	l = fread(buffer, 1, sizeof(buffer) - 1, f);
+	buffer[l] = 0;
+	fclose(f);
+
+	return l;
+}
+
+/*
+ * Read the whole of file /sys/kernel/debug/slab/<cache>/<name> into the
+ * global buffer.
+ *
+ * At most sizeof(buffer) - 1 bytes are read so that the terminating NUL
+ * always fits.  Returns the number of bytes read; 0 (with an empty buffer)
+ * when the path does not fit or the file cannot be opened.
+ */
+static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name)
+{
+	char x[128];
+	FILE *f;
+	size_t l;
+	int len;
+
+	buffer[0] = 0;
+
+	len = snprintf(x, sizeof(x), "/sys/kernel/debug/slab/%s/%s",
+		       s->name, name);
+	if (len < 0 || (size_t)len >= sizeof(x))
+		return 0;
+
+	f = fopen(x, "r");
+	if (!f)
+		return 0;
+
+	/* Leave room for the terminator written below. */
+	l = fread(buffer, 1, sizeof(buffer) - 1, f);
+	buffer[l] = 0;
+	fclose(f);
+
+	return l;
+}
+
+/*
+ * Put a size string together
+ *
+ * Formats @value into @buffer.  With show_bytes set the plain number is
+ * written.  Otherwise values above 1000, 1000000 and 1000000000 are scaled
+ * and printed with one decimal digit and a K, M or G suffix, e.g. 1234
+ * becomes "1.2K".
+ *
+ * Returns the length of the resulting string.  @buffer must be large
+ * enough for the result; no bound is checked here.
+ */
+static int store_size(char *buffer, unsigned long value)
+{
+	unsigned long divisor = 1;
+	char trailer = 0;
+	int n;
+
+	if (!show_bytes) {
+		/*
+		 * Each divisor is a tenth of the unit, so the quotient keeps
+		 * one extra digit that becomes the decimal place below.
+		 */
+		if (value > 1000000000UL) {
+			divisor = 100000000UL;
+			trailer = 'G';
+		} else if (value > 1000000UL) {
+			divisor = 100000UL;
+			trailer = 'M';
+		} else if (value > 1000UL) {
+			divisor = 100;
+			trailer = 'K';
+		}
+	}
+
+	value /= divisor;
+	/* value is unsigned long, so the conversion must be %lu. */
+	n = sprintf(buffer, "%lu", value);
+	if (trailer) {
+		buffer[n] = trailer;
+		n++;
+		buffer[n] = 0;
+	}
+	if (divisor != 1) {
+		/*
+		 * Shift the last digit, the suffix and the NUL one place to
+		 * the right and drop the decimal point into the gap.
+		 */
+		memmove(buffer + n - 2, buffer + n - 3, 4);
+		buffer[n-2] = '.';
+		n++;
+	}
+	return n;
+}
+
+/*
+ * Decode a per-node list of the form "N<node>=<count> N<node>=<count> ..."
+ * into @numa, which must have room for MAX_NODES entries.
+ *
+ * @numa is cleared first; a NULL @t leaves it all zero.  Entries naming a
+ * node outside 0..MAX_NODES-1 are skipped so that malformed or unexpected
+ * input cannot write outside the array.  highest_node is raised to the
+ * largest node number that was stored.
+ */
+static void decode_numa_list(int *numa, char *t)
+{
+	unsigned long node;
+	unsigned long nr;
+
+	memset(numa, 0, MAX_NODES * sizeof(int));
+
+	if (!t)
+		return;
+
+	while (*t == 'N') {
+		t++;
+		node = strtoul(t, &t, 10);
+		if (*t == '=') {
+			t++;
+			nr = strtoul(t, &t, 10);
+			if (node < MAX_NODES) {
+				numa[node] = nr;
+				if ((int)node > highest_node)
+					highest_node = node;
+			}
+		}
+		while (*t == ' ')
+			t++;
+	}
+}
+
+/*
+ * Write 1 to the "validate" attribute of cache @s.  Entries named "*" are
+ * left alone.
+ */
+static void slab_validate(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") != 0)
+		set_obj(s, "validate", 1);
+}
+
+/*
+ * Write 1 to the "shrink" attribute of cache @s.  Entries named "*" are
+ * left alone.
+ */
+static void slab_shrink(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") != 0)
+		set_obj(s, "shrink", 1);
+}
+
+int line = 0;
+
+/*
+ * Print the column header of the cache listing.  The layout depends on
+ * show_activity; in the default layout the size column is titled after
+ * sort_loss.
+ */
+static void first_line(void)
+{
+	const char *size_title;
+
+	if (show_activity) {
+		printf("Name Objects Alloc Free"
+			" %%Fast Fallb O CmpX UL\n");
+		return;
+	}
+
+	size_title = sort_loss ? " Loss" : "Space";
+	printf("Name Objects Objsize %s "
+		"Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n",
+		size_title);
+}
+
+/*
+ * Pick one alias of a slab
+ *
+ * Walks aliasinfo[] in order.  The first alias of @find is taken; a later
+ * one replaces it only when its name is strictly longer.  The walk stops
+ * as soon as a taken alias has a name starting with "kmall".  Returns NULL
+ * when @find has no alias.
+ */
+static struct aliasinfo *find_one_alias(struct slabinfo *find)
+{
+	struct aliasinfo *chosen = NULL;
+	int i;
+
+	for (i = 0; i < aliases; i++) {
+		struct aliasinfo *cand = &aliasinfo[i];
+
+		if (cand->slab != find)
+			continue;
+		if (chosen && strlen(chosen->name) >= strlen(cand->name))
+			continue;
+
+		chosen = cand;
+		if (strncmp(cand->name, "kmall", 5) == 0)
+			break;
+	}
+	return chosen;
+}
+
+/*
+ * Memory occupied by cache @s: its number of slabs times the bytes per
+ * slab (page_size shifted left by the cache's order).
+ */
+static unsigned long slab_size(struct slabinfo *s)
+{
+	unsigned long bytes_per_slab = page_size << s->order;
+
+	return s->slabs * bytes_per_slab;
+}
+
+/*
+ * Sum of the fastpath and slowpath allocation and free counters of @s.
+ */
+static unsigned long slab_activity(struct slabinfo *s)
+{
+	unsigned long fast = s->alloc_fastpath + s->free_fastpath;
+	unsigned long slow = s->alloc_slowpath + s->free_slowpath;
+
+	return fast + slow;
+}
+
+/*
+ * Memory held by cache @s that is not object payload: slab_size() minus
+ * objects times object_size.
+ */
+static unsigned long slab_waste(struct slabinfo *s)
+{
+	unsigned long payload = s->objects * s->object_size;
+
+	return slab_size(s) - payload;
+}
+
+/*
+ * Print the per-node slab counts of cache @s.
+ *
+ * With @mode zero one row labelled with the cache name is printed.  With
+ * @mode non-zero the row is labelled "All slabs" and followed by a second
+ * row holding the per-node partial counts.  A header with the node numbers
+ * precedes the first row printed while the global line counter is zero.
+ */
+static void slab_numa(struct slabinfo *s, int mode)
+{
+	char cell[20];
+	int n;
+
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (highest_node == 0) {
+		printf("\n%s: No NUMA information available.\n", s->name);
+		return;
+	}
+
+	if (skip_zero && s->slabs == 0)
+		return;
+
+	if (line == 0) {
+		/* Header: one column per node, then a ruler. */
+		printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+		for (n = 0; n <= highest_node; n++)
+			printf(" %4d", n);
+		printf("\n----------------------");
+		for (n = 0; n <= highest_node; n++)
+			printf("-----");
+		printf("\n");
+	}
+
+	printf("%-21s ", mode ? "All slabs" : s->name);
+	for (n = 0; n <= highest_node; n++) {
+		store_size(cell, s->numa[n]);
+		printf(" %4s", cell);
+	}
+	printf("\n");
+
+	if (mode) {
+		printf("%-21s ", "Partial slabs");
+		for (n = 0; n <= highest_node; n++) {
+			store_size(cell, s->numa_partial[n]);
+			printf(" %4s", cell);
+		}
+		printf("\n");
+	}
+
+	line++;
+}
+
+/*
+ * Print the allocation and free call-site data of cache @s.
+ *
+ * For each of the two sections the debugfs file (alloc_traces/free_traces)
+ * is tried first and the per-cache attribute (alloc_calls/free_calls) is
+ * the fallback; "No Data" is printed when neither yields anything.
+ */
+static void show_tracking(struct slabinfo *s)
+{
+	printf("\n%s: Kernel object allocation\n", s->name);
+	printf("-----------------------------------------------------------------------\n");
+	if (!read_debug_slab_obj(s, "alloc_traces") &&
+	    !read_slab_obj(s, "alloc_calls"))
+		printf("No Data\n");
+	else
+		printf("%s", buffer);
+
+	printf("\n%s: Kernel object freeing\n", s->name);
+	printf("------------------------------------------------------------------------\n");
+	if (!read_debug_slab_obj(s, "free_traces") &&
+	    !read_slab_obj(s, "free_calls"))
+		printf("No Data\n");
+	else
+		printf("%s", buffer);
+
+}
+
+/*
+ * Print the contents of the "ops" attribute of cache @s, or a note that
+ * the cache has none.  Entries named "*" are skipped.
+ */
+static void ops(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (!read_slab_obj(s, "ops")) {
+		printf("\n%s has no kmem_cache operations\n", s->name);
+		return;
+	}
+
+	printf("\n%s: kmem_cache operations\n", s->name);
+	printf("--------------------------------------------\n");
+	printf("%s", buffer);
+}
+
+/*
+ * Map a flag to a fixed-width label: "On " for non-zero, "Off" for zero.
+ */
+static const char *onoff(int x)
+{
+	return x ? "On " : "Off";
+}
+
+/*
+ * Print the event-counter tables of cache @s: the alloc/free path
+ * breakdown, the flush count, the deactivation breakdown and the cmpxchg
+ * retry counts.  Nothing is printed when alloc_slab is zero or when no
+ * allocation was counted.
+ */
+static void slab_stats(struct slabinfo *s)
+{
+ unsigned long total_alloc;
+ unsigned long total_free;
+ unsigned long total;
+
+ if (!s->alloc_slab)
+ return;
+
+ total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+ total_free = s->free_fastpath + s->free_slowpath;
+
+ /* total_alloc is the divisor of every %Al column below. */
+ if (!total_alloc)
+ return;
+
+ printf("\n");
+ printf("Slab Perf Counter Alloc Free %%Al %%Fr\n");
+ printf("--------------------------------------------------\n");
+ /* The %Fr columns are guarded separately because total_free may be 0. */
+ printf("Fastpath %8lu %8lu %3lu %3lu\n",
+ s->alloc_fastpath, s->free_fastpath,
+ s->alloc_fastpath * 100 / total_alloc,
+ total_free ? s->free_fastpath * 100 / total_free : 0);
+ printf("Slowpath %8lu %8lu %3lu %3lu\n",
+ total_alloc - s->alloc_fastpath, s->free_slowpath,
+ (total_alloc - s->alloc_fastpath) * 100 / total_alloc,
+ total_free ? s->free_slowpath * 100 / total_free : 0);
+ printf("Page Alloc %8lu %8lu %3lu %3lu\n",
+ s->alloc_slab, s->free_slab,
+ s->alloc_slab * 100 / total_alloc,
+ total_free ? s->free_slab * 100 / total_free : 0);
+ printf("Add partial %8lu %8lu %3lu %3lu\n",
+ s->deactivate_to_head + s->deactivate_to_tail,
+ s->free_add_partial,
+ (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
+ total_free ? s->free_add_partial * 100 / total_free : 0);
+ printf("Remove partial %8lu %8lu %3lu %3lu\n",
+ s->alloc_from_partial, s->free_remove_partial,
+ s->alloc_from_partial * 100 / total_alloc,
+ total_free ? s->free_remove_partial * 100 / total_free : 0);
+
+ printf("Cpu partial list %8lu %8lu %3lu %3lu\n",
+ s->cpu_partial_alloc, s->cpu_partial_free,
+ s->cpu_partial_alloc * 100 / total_alloc,
+ total_free ? s->cpu_partial_free * 100 / total_free : 0);
+
+ printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
+ s->deactivate_remote_frees, s->free_frozen,
+ s->deactivate_remote_frees * 100 / total_alloc,
+ total_free ? s->free_frozen * 100 / total_free : 0);
+
+ printf("Total %8lu %8lu\n\n", total_alloc, total_free);
+
+ if (s->cpuslab_flush)
+ printf("Flushes %8lu\n", s->cpuslab_flush);
+
+ /* Sum of the five deactivation counters; divisor of the table below. */
+ total = s->deactivate_full + s->deactivate_empty +
+ s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass;
+
+ if (total) {
+ printf("\nSlab Deactivation Occurrences %%\n");
+ printf("-------------------------------------------------\n");
+ printf("Slab full %7lu %3lu%%\n",
+ s->deactivate_full, (s->deactivate_full * 100) / total);
+ printf("Slab empty %7lu %3lu%%\n",
+ s->deactivate_empty, (s->deactivate_empty * 100) / total);
+ printf("Moved to head of partial list %7lu %3lu%%\n",
+ s->deactivate_to_head, (s->deactivate_to_head * 100) / total);
+ printf("Moved to tail of partial list %7lu %3lu%%\n",
+ s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
+ printf("Deactivation bypass %7lu %3lu%%\n",
+ s->deactivate_bypass, (s->deactivate_bypass * 100) / total);
+ /*
+ * alloc_refill and alloc_node_mismatch are not part of total, so
+ * these two percentages are relative to the deactivation sum only.
+ */
+ printf("Refilled from foreign frees %7lu %3lu%%\n",
+ s->alloc_refill, (s->alloc_refill * 100) / total);
+ printf("Node mismatch %7lu %3lu%%\n",
+ s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total);
+ }
+
+ if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) {
+ printf("\nCmpxchg_double Looping\n------------------------\n");
+ printf("Locked Cmpxchg Double redos %lu\nUnlocked Cmpxchg Double redos %lu\n",
+ s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail);
+ }
+}
+
+/*
+ * Print the detailed report for cache @s: a summary line, the property
+ * notes, the size/slab/debug/memory table, and then the output of ops(),
+ * show_tracking(), slab_numa() and slab_stats().  Entries named "*" are
+ * skipped.
+ *
+ * NOTE(review): several unsigned fields are printed with signed
+ * conversions (%d / %ld) below.
+ */
+static void report(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n",
+ s->name, s->aliases, s->order, s->objects);
+ if (s->hwcache_align)
+ printf("** Hardware cacheline aligned\n");
+ if (s->cache_dma)
+ printf("** Memory is allocated in a special DMA zone\n");
+ if (s->destroy_by_rcu)
+ printf("** Slabs are destroyed via RCU\n");
+ if (s->reclaim_account)
+ printf("** Reclaim accounting active\n");
+
+ printf("\nSizes (bytes) Slabs Debug Memory\n");
+ printf("------------------------------------------------------------------------\n");
+ /* Memory "Total" is the same product that slab_size() computes. */
+ printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n",
+ s->object_size, s->slabs, onoff(s->sanity_checks),
+ s->slabs * (page_size << s->order));
+ /* "Full" is what remains after removing partial and per-cpu slabs. */
+ printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n",
+ s->slab_size, s->slabs - s->partial - s->cpu_slabs,
+ onoff(s->red_zone), s->objects * s->object_size);
+ printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n",
+ page_size << s->order, s->partial, onoff(s->poison),
+ s->slabs * (page_size << s->order) - s->objects * s->object_size);
+ /* "Lalig" scales the per-object overhead by the object count. */
+ printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n",
+ s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
+ (s->slab_size - s->object_size) * s->objects);
+ /* "Lpadd" is the unused tail of one slab times the number of slabs. */
+ printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n",
+ s->align, s->objs_per_slab, onoff(s->trace),
+ ((page_size << s->order) - s->objs_per_slab * s->slab_size) *
+ s->slabs);
+
+ ops(s);
+ show_tracking(s);
+ slab_numa(s, 1);
+ slab_stats(s);
+}
+
+/*
+ * Print the one-line summary of cache @s, preceded by the column header
+ * when this is the first line of output.
+ *
+ * Entries named "*" are skipped, as are reclaim-accounted caches when
+ * unreclaim_only is set.  When exactly one cache is left the detailed
+ * report() is printed instead.  show_empty restricts the listing to caches
+ * without slabs; otherwise such caches are dropped while skip_zero is set.
+ */
+static void slabcache(struct slabinfo *s)
+{
+	char size_str[20];
+	char dist_str[40];
+	char flags[20];
+	char *p = flags;
+	size_t i;
+	/* One marker character per property, in output order. */
+	const struct {
+		int on;
+		char mark;
+	} flag_tab[] = {
+		{ s->aliases != 0,         '*' },
+		{ s->cache_dma != 0,       'd' },
+		{ s->hwcache_align != 0,   'A' },
+		{ s->poison != 0,          'P' },
+		{ s->reclaim_account != 0, 'a' },
+		{ s->red_zone != 0,        'Z' },
+		{ s->sanity_checks != 0,   'F' },
+		{ s->store_user != 0,      'U' },
+		{ s->trace != 0,           'T' },
+	};
+
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (unreclaim_only && s->reclaim_account)
+		return;
+
+	if (actual_slabs == 1) {
+		report(s);
+		return;
+	}
+
+	if (s->slabs) {
+		if (show_empty)
+			return;
+	} else if (skip_zero && !show_empty) {
+		return;
+	}
+
+	store_size(size_str, sort_loss ? slab_waste(s) : slab_size(s));
+	snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
+		 s->partial, s->cpu_slabs);
+
+	if (!line++)
+		first_line();
+
+	for (i = 0; i < sizeof(flag_tab) / sizeof(flag_tab[0]); i++)
+		if (flag_tab[i].on)
+			*p++ = flag_tab[i].mark;
+	*p = 0;
+
+	if (show_activity) {
+		unsigned long total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+		unsigned long total_free = s->free_fastpath + s->free_slowpath;
+
+		printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n",
+			s->name, s->objects,
+			total_alloc, total_free,
+			total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
+			total_free ? (s->free_fastpath * 100 / total_free) : 0,
+			s->order_fallback, s->order, s->cmpxchg_double_fail,
+			s->cmpxchg_double_cpu_fail);
+		return;
+	}
+
+	printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
+		s->name, s->objects, s->object_size, size_str, dist_str,
+		s->objs_per_slab, s->order,
+		s->slabs ? (s->partial * 100) / s->slabs : 100,
+		s->slabs ? (s->objects * s->object_size * 100) /
+			(s->slabs * (page_size << s->order)) : 100,
+		flags);
+}
+
+/*
+ * Analyze debug options. Return false if something is amiss.
+ *
+ * A NULL or empty string, or "-", selects nothing.  "a" (either case)
+ * switches on sanity, poison, redzone and tracking.  Otherwise every
+ * character selects one option: F sanity, P poison, Z redzone, U tracking,
+ * T tracing (either case).  An unknown character, or an option that is
+ * already set, makes the scan fail.
+ */
+static int debug_opt_scan(char *opt)
+{
+	int *flag;
+
+	if (!opt || !opt[0] || strcmp(opt, "-") == 0)
+		return 1;
+
+	if (strcasecmp(opt, "a") == 0) {
+		sanity = 1;
+		poison = 1;
+		redzone = 1;
+		tracking = 1;
+		return 1;
+	}
+
+	while (*opt) {
+		switch (*opt++) {
+		case 'F': case 'f':
+			flag = &sanity;
+			break;
+		case 'P': case 'p':
+			flag = &poison;
+			break;
+		case 'Z': case 'z':
+			flag = &redzone;
+			break;
+		case 'U': case 'u':
+			flag = &tracking;
+			break;
+		case 'T': case 't':
+			flag = &tracing;
+			break;
+		default:
+			return 0;
+		}
+
+		/* Naming the same option twice is rejected. */
+		if (*flag)
+			return 0;
+		*flag = 1;
+	}
+	return 1;
+}
+
+/*
+ * Tell whether cache @s holds no objects.
+ *
+ * Returns 0 when objects are present.  Otherwise returns 1, after writing
+ * 1 to the "shrink" attribute if the cache still has slabs.
+ */
+static int slab_empty(struct slabinfo *s)
+{
+	if (s->objects)
+		return 0;
+
+	/*
+	 * A cache without objects may still own slabs; asking it to shrink
+	 * will remove them.
+	 */
+	if (s->slabs)
+		set_obj(s, "shrink", 1);
+
+	return 1;
+}
+
+/*
+ * Bring the debug attributes of cache @s in line with the requested debug
+ * options (sanity, redzone, poison, tracking, tracing).
+ *
+ * Sanity checks can be switched on at any time.  Every other change of
+ * sanity_checks, red_zone, poison and store_user is only attempted when
+ * slab_empty() reports the cache empty; otherwise a message is printed on
+ * stderr.  Tracing is only switched on when a single cache is selected.
+ * Entries named "*" are skipped.
+ */
+static void slab_debug(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (sanity && !s->sanity_checks) {
+		set_obj(s, "sanity_checks", 1);
+	}
+	if (!sanity && s->sanity_checks) {
+		if (slab_empty(s))
+			set_obj(s, "sanity_checks", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
+	}
+	if (redzone && !s->red_zone) {
+		if (slab_empty(s))
+			set_obj(s, "red_zone", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
+	}
+	if (!redzone && s->red_zone) {
+		if (slab_empty(s))
+			set_obj(s, "red_zone", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
+	}
+	if (poison && !s->poison) {
+		if (slab_empty(s))
+			set_obj(s, "poison", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
+	}
+	if (!poison && s->poison) {
+		if (slab_empty(s))
+			set_obj(s, "poison", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
+	}
+	if (tracking && !s->store_user) {
+		if (slab_empty(s))
+			set_obj(s, "store_user", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
+	}
+	if (!tracking && s->store_user) {
+		if (slab_empty(s))
+			set_obj(s, "store_user", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
+	}
+	if (tracing && !s->trace) {
+		if (slabs == 1)
+			set_obj(s, "trace", 1);
+		else
+			fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
+	}
+	/* Tracing is on but was not requested: switch it off by writing 0. */
+	if (!tracing && s->trace)
+		set_obj(s, "trace", 0);
+}
+
+/*
+ * Print summary statistics over all caches in slabinfo[].
+ *
+ * Caches without slabs or without objects are left out.  For the remaining
+ * ones the minimum, maximum, total and average of several quantities are
+ * collected and printed as a "Per Cache" and a "Per Object" table.  Prints
+ * "No objects" or "No slabs" instead when nothing was collected.
+ */
+static void totals(void)
+{
+ struct slabinfo *s;
+
+ int used_slabs = 0;
+ char b1[20], b2[20], b3[20], b4[20];
+ /* Start value of the minima; any real value is expected to be smaller. */
+ unsigned long long max = 1ULL << 63;
+
+ /* Object size */
+ unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
+
+ /* Number of partial slabs in a slabcache */
+ unsigned long long min_partial = max, max_partial = 0,
+ avg_partial, total_partial = 0;
+
+ /* Number of slabs in a slab cache */
+ unsigned long long min_slabs = max, max_slabs = 0,
+ avg_slabs, total_slabs = 0;
+
+ /* Size of the whole slab */
+ unsigned long long min_size = max, max_size = 0,
+ avg_size, total_size = 0;
+
+ /* Bytes used for object storage in a slab */
+ unsigned long long min_used = max, max_used = 0,
+ avg_used, total_used = 0;
+
+ /* Waste: Bytes used for alignment and padding */
+ unsigned long long min_waste = max, max_waste = 0,
+ avg_waste, total_waste = 0;
+ /* Number of objects in a slab */
+ unsigned long long min_objects = max, max_objects = 0,
+ avg_objects, total_objects = 0;
+ /* Waste per object */
+ unsigned long long min_objwaste = max,
+ max_objwaste = 0, avg_objwaste,
+ total_objwaste = 0;
+
+ /* Memory per object */
+ unsigned long long min_memobj = max,
+ max_memobj = 0, avg_memobj,
+ total_objsize = 0;
+
+ /* Percentage of partial slabs per slab */
+ unsigned long min_ppart = 100, max_ppart = 0,
+ avg_ppart, total_ppart = 0;
+
+ /* Number of objects in partial slabs */
+ unsigned long min_partobj = max, max_partobj = 0,
+ avg_partobj, total_partobj = 0;
+
+ /* Percentage of partial objects of all objects in a slab */
+ unsigned long min_ppartobj = 100, max_ppartobj = 0,
+ avg_ppartobj, total_ppartobj = 0;
+
+
+ for (s = slabinfo; s < slabinfo + slabs; s++) {
+ unsigned long long size;
+ unsigned long used;
+ unsigned long long wasted;
+ unsigned long long objwaste;
+ unsigned long percentage_partial_slabs;
+ unsigned long percentage_partial_objs;
+
+ /* Skipping these also keeps the two divisions below safe. */
+ if (!s->slabs || !s->objects)
+ continue;
+
+ used_slabs++;
+
+ size = slab_size(s);
+ used = s->objects * s->object_size;
+ wasted = size - used;
+ objwaste = s->slab_size - s->object_size;
+
+ /* Both percentages are clamped to 100. */
+ percentage_partial_slabs = s->partial * 100 / s->slabs;
+ if (percentage_partial_slabs > 100)
+ percentage_partial_slabs = 100;
+
+ percentage_partial_objs = s->objects_partial * 100
+ / s->objects;
+
+ if (percentage_partial_objs > 100)
+ percentage_partial_objs = 100;
+
+ /* Track the minima. */
+ if (s->object_size < min_objsize)
+ min_objsize = s->object_size;
+ if (s->partial < min_partial)
+ min_partial = s->partial;
+ if (s->slabs < min_slabs)
+ min_slabs = s->slabs;
+ if (size < min_size)
+ min_size = size;
+ if (wasted < min_waste)
+ min_waste = wasted;
+ if (objwaste < min_objwaste)
+ min_objwaste = objwaste;
+ if (s->objects < min_objects)
+ min_objects = s->objects;
+ if (used < min_used)
+ min_used = used;
+ if (s->objects_partial < min_partobj)
+ min_partobj = s->objects_partial;
+ if (percentage_partial_slabs < min_ppart)
+ min_ppart = percentage_partial_slabs;
+ if (percentage_partial_objs < min_ppartobj)
+ min_ppartobj = percentage_partial_objs;
+ if (s->slab_size < min_memobj)
+ min_memobj = s->slab_size;
+
+ /* Track the maxima. */
+ if (s->object_size > max_objsize)
+ max_objsize = s->object_size;
+ if (s->partial > max_partial)
+ max_partial = s->partial;
+ if (s->slabs > max_slabs)
+ max_slabs = s->slabs;
+ if (size > max_size)
+ max_size = size;
+ if (wasted > max_waste)
+ max_waste = wasted;
+ if (objwaste > max_objwaste)
+ max_objwaste = objwaste;
+ if (s->objects > max_objects)
+ max_objects = s->objects;
+ if (used > max_used)
+ max_used = used;
+ if (s->objects_partial > max_partobj)
+ max_partobj = s->objects_partial;
+ if (percentage_partial_slabs > max_ppart)
+ max_ppart = percentage_partial_slabs;
+ if (percentage_partial_objs > max_ppartobj)
+ max_ppartobj = percentage_partial_objs;
+ if (s->slab_size > max_memobj)
+ max_memobj = s->slab_size;
+
+ /* Accumulate the totals. */
+ total_partial += s->partial;
+ total_slabs += s->slabs;
+ total_size += size;
+ total_waste += wasted;
+
+ total_objects += s->objects;
+ total_used += used;
+ total_partobj += s->objects_partial;
+ total_ppart += percentage_partial_slabs;
+ total_ppartobj += percentage_partial_objs;
+
+ total_objwaste += s->objects * objwaste;
+ total_objsize += s->objects * s->slab_size;
+ }
+
+ if (!total_objects) {
+ printf("No objects\n");
+ return;
+ }
+ if (!used_slabs) {
+ printf("No slabs\n");
+ return;
+ }
+
+ /* Per slab averages */
+ avg_partial = total_partial / used_slabs;
+ avg_slabs = total_slabs / used_slabs;
+ avg_size = total_size / used_slabs;
+ avg_waste = total_waste / used_slabs;
+
+ avg_objects = total_objects / used_slabs;
+ avg_used = total_used / used_slabs;
+ avg_partobj = total_partobj / used_slabs;
+ avg_ppart = total_ppart / used_slabs;
+ avg_ppartobj = total_ppartobj / used_slabs;
+
+ /* Per object object sizes */
+ avg_objsize = total_used / total_objects;
+ avg_objwaste = total_objwaste / total_objects;
+ /*
+ * NOTE(review): this overwrites the per-cache average assigned to
+ * avg_partobj above with a percentage, which the "PartObjs" row then
+ * prints as its average -- confirm whether avg_ppartobj was meant.
+ */
+ avg_partobj = total_partobj * 100 / total_objects;
+ avg_memobj = total_objsize / total_objects;
+
+ printf("Slabcache Totals\n");
+ printf("----------------\n");
+ printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n",
+ slabs, aliases, alias_targets, used_slabs);
+
+ store_size(b1, total_size);store_size(b2, total_waste);
+ store_size(b3, total_waste * 100 / total_used);
+ printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3);
+
+ store_size(b1, total_objects);store_size(b2, total_partobj);
+ store_size(b3, total_partobj * 100 / total_objects);
+ printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3);
+
+ printf("\n");
+ printf("Per Cache Average "
+ "Min Max Total\n");
+ printf("---------------------------------------"
+ "-------------------------------------\n");
+
+ store_size(b1, avg_objects);store_size(b2, min_objects);
+ store_size(b3, max_objects);store_size(b4, total_objects);
+ printf("#Objects %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_slabs);store_size(b2, min_slabs);
+ store_size(b3, max_slabs);store_size(b4, total_slabs);
+ printf("#Slabs %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_partial);store_size(b2, min_partial);
+ store_size(b3, max_partial);store_size(b4, total_partial);
+ printf("#PartSlab %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+ store_size(b1, avg_ppart);store_size(b2, min_ppart);
+ store_size(b3, max_ppart);
+ store_size(b4, total_partial * 100 / total_slabs);
+ printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_partobj);store_size(b2, min_partobj);
+ store_size(b3, max_partobj);
+ store_size(b4, total_partobj);
+ printf("PartObjs %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
+ store_size(b3, max_ppartobj);
+ store_size(b4, total_partobj * 100 / total_objects);
+ printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_size);store_size(b2, min_size);
+ store_size(b3, max_size);store_size(b4, total_size);
+ printf("Memory %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_used);store_size(b2, min_used);
+ store_size(b3, max_used);store_size(b4, total_used);
+ printf("Used %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_waste);store_size(b2, min_waste);
+ store_size(b3, max_waste);store_size(b4, total_waste);
+ printf("Loss %15s %15s %15s %15s\n",
+ b1, b2, b3, b4);
+
+ printf("\n");
+ printf("Per Object Average "
+ "Min Max\n");
+ printf("---------------------------------------"
+ "--------------------\n");
+
+ store_size(b1, avg_memobj);store_size(b2, min_memobj);
+ store_size(b3, max_memobj);
+ printf("Memory %15s %15s %15s\n",
+ b1, b2, b3);
+ store_size(b1, avg_objsize);store_size(b2, min_objsize);
+ store_size(b3, max_objsize);
+ printf("User %15s %15s %15s\n",
+ b1, b2, b3);
+
+ store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
+ store_size(b3, max_objwaste);
+ printf("Loss %15s %15s %15s\n",
+ b1, b2, b3);
+}
+
+/*
+ * Three-way ordering on a numeric key: the larger key sorts first and equal
+ * keys fall back to a case-insensitive comparison of the names.  A positive
+ * result means the first entry belongs after the second.
+ */
+static int slab_sort_order(unsigned long v1, unsigned long v2,
+			   const char *n1, const char *n2)
+{
+	if (v1 == v2)
+		return strcasecmp(n1, n2);
+	return v1 < v2 ? 1 : -1;
+}
+
+/*
+ * Sort slabinfo[] in place.
+ *
+ * The key is chosen by the first set flag among sort_size, sort_active,
+ * sort_loss and sort_partial (largest first, names as tie-breaker); without
+ * any of them the entries are ordered by name.  show_inverted reverses the
+ * order.  The comparison is a true three-way result so that negating it
+ * reverses numeric orderings too, not only the name ordering.
+ */
+static void sort_slabs(void)
+{
+	struct slabinfo *s1, *s2;
+
+	for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
+		for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
+			int result;
+
+			if (sort_size)
+				result = slab_sort_order(slab_size(s1), slab_size(s2),
+							 s1->name, s2->name);
+			else if (sort_active)
+				result = slab_sort_order(slab_activity(s1),
+							 slab_activity(s2),
+							 s1->name, s2->name);
+			else if (sort_loss)
+				result = slab_sort_order(slab_waste(s1), slab_waste(s2),
+							 s1->name, s2->name);
+			else if (sort_partial)
+				result = slab_sort_order(s1->partial, s2->partial,
+							 s1->name, s2->name);
+			else
+				result = strcasecmp(s1->name, s2->name);
+
+			if (show_inverted)
+				result = -result;
+
+			if (result > 0) {
+				struct slabinfo t;
+
+				memcpy(&t, s1, sizeof(struct slabinfo));
+				memcpy(s1, s2, sizeof(struct slabinfo));
+				memcpy(s2, &t, sizeof(struct slabinfo));
+			}
+		}
+	}
+}
+
+/*
+ * Sort aliasinfo[] in place, case-insensitively.  The key is the alias
+ * name, except that the referenced cache name is used when show_alias is
+ * set and show_inverted is not.
+ */
+static void sort_aliases(void)
+{
+	const int by_ref = show_alias && !show_inverted;
+	int i, j;
+
+	for (i = 0; i < aliases; i++) {
+		for (j = i + 1; j < aliases; j++) {
+			struct aliasinfo *lhs = &aliasinfo[i];
+			struct aliasinfo *rhs = &aliasinfo[j];
+			const char *k1 = by_ref ? lhs->ref : lhs->name;
+			const char *k2 = by_ref ? rhs->ref : rhs->name;
+
+			if (strcasecmp(k1, k2) > 0) {
+				struct aliasinfo tmp = *lhs;
+
+				*lhs = *rhs;
+				*rhs = tmp;
+			}
+		}
+	}
+}
+
+/*
+ * Resolve every alias to the cache whose name equals the alias' ref and
+ * count the reference on that cache.  An alias without a matching cache is
+ * fatal.
+ */
+static void link_slabs(void)
+{
+	int i, j;
+
+	for (i = 0; i < aliases; i++) {
+		struct aliasinfo *a = &aliasinfo[i];
+		struct slabinfo *target = NULL;
+
+		for (j = 0; j < slabs; j++) {
+			if (strcmp(a->ref, slabinfo[j].name) == 0) {
+				target = &slabinfo[j];
+				break;
+			}
+		}
+
+		if (!target)
+			fatal("Unresolved alias %s\n", a->ref);
+
+		a->slab = target;
+		target->refs++;
+	}
+}
+
+/*
+ * Sort and resolve the aliases, then print them.
+ *
+ * Normally the aliases are grouped: each cache name is followed by "<-"
+ * and all of its aliases on one line.  With show_inverted every alias is
+ * printed on its own line as "alias -> cache".  Caches referenced by a
+ * single alias are left out unless show_single_ref is set.
+ */
+static void alias(void)
+{
+	struct aliasinfo *a;
+	char *active = NULL;
+
+	sort_aliases();
+	link_slabs();
+
+	for (a = aliasinfo; a < aliasinfo + aliases; a++) {
+		char *target = a->slab->name;
+
+		if (!show_single_ref && a->slab->refs == 1)
+			continue;
+
+		if (show_inverted) {
+			printf("%-15s -> %s\n", a->name, target);
+			continue;
+		}
+
+		if (active && strcmp(target, active) == 0) {
+			/* Same cache as the previous alias: extend its line. */
+			printf(" %s", a->name);
+		} else {
+			printf("\n%-12s <- %s", target, a->name);
+			active = target;
+		}
+	}
+
+	/* Terminate the last grouped line. */
+	if (active)
+		printf("\n");
+}
+
+
+/*
+ * Replace the name of every cache whose name starts with ':' by the name
+ * of one of its aliases.
+ *
+ * Caches with more than one reference are left untouched unless
+ * show_first_alias is set.  A cache without any alias is renamed "*" and
+ * no longer counted in actual_slabs.
+ */
+static void rename_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < slabs; i++) {
+		struct slabinfo *s = &slabinfo[i];
+		struct aliasinfo *a;
+
+		if (s->name[0] != ':')
+			continue;
+
+		if (!show_first_alias && s->refs > 1)
+			continue;
+
+		a = find_one_alias(s);
+		if (!a) {
+			s->name = "*";
+			actual_slabs--;
+			continue;
+		}
+
+		s->name = a->name;
+	}
+}
+
+/*
+ * Match @slab against the global pattern.  Returns the result of
+ * regexec(): zero on a match, non-zero otherwise.
+ */
+static int slab_mismatch(char *slab)
+{
+	int rc = regexec(&pattern, slab, 0, NULL, 0);
+
+	return rc;
+}
+
+/*
+ * Fill slabinfo[] and aliasinfo[] from the SLUB sysfs directory.
+ *
+ * The working directory is changed to /sys/kernel/slab (or /sys/slab as a
+ * fallback) and every entry is visited. Entries whose name starts with '.'
+ * are skipped, as are names rejected by slab_mismatch() unless they start
+ * with ':'. Symlinks are stored as aliases (link name plus the last path
+ * component of the link target); directories are stored as caches and their
+ * attributes are read with get_obj()/get_obj_and_str().
+ *
+ * On return slabs, actual_slabs and aliases hold the number of entries
+ * recorded. Every failure is reported through fatal().
+ */
+static void read_slab_dir(void)
+{
+	DIR *dir;
+	struct dirent *de;
+	struct slabinfo *slab = slabinfo;
+	struct aliasinfo *alias = aliasinfo;
+	char *p;
+	char *t;
+	int count;
+
+	if (chdir("/sys/kernel/slab") && chdir("/sys/slab"))
+		fatal("SYSFS support for SLUB not active\n");
+
+	dir = opendir(".");
+	/* readdir(NULL) would crash, so fail with a message instead. */
+	if (!dir)
+		fatal("Unable to open slab directory\n");
+
+	while ((de = readdir(dir))) {
+		if (de->d_name[0] == '.' ||
+			(de->d_name[0] != ':' && slab_mismatch(de->d_name)))
+				continue;
+		switch (de->d_type) {
+		case DT_LNK:
+			if (alias - aliasinfo == MAX_ALIASES)
+				fatal("Too many aliases\n");
+			alias->name = strdup(de->d_name);
+			count = readlink(de->d_name, buffer, sizeof(buffer)-1);
+
+			if (count < 0)
+				fatal("Cannot read symlink %s\n", de->d_name);
+
+			/* readlink() does not terminate the string. */
+			buffer[count] = 0;
+			/* Keep only the component after the last '/'. */
+			p = buffer + count;
+			while (p > buffer && p[-1] != '/')
+				p--;
+			alias->ref = strdup(p);
+			alias++;
+			break;
+		case DT_DIR:
+			if (slab - slabinfo == MAX_SLABS)
+				fatal("Too many slabs\n");
+			/* slab->name is not assigned yet, report the entry name. */
+			if (chdir(de->d_name))
+				fatal("Unable to access slab %s\n", de->d_name);
+			slab->name = strdup(de->d_name);
+			slab->alias = 0;
+			slab->refs = 0;
+			slab->aliases = get_obj("aliases");
+			slab->align = get_obj("align");
+			slab->cache_dma = get_obj("cache_dma");
+			slab->cpu_slabs = get_obj("cpu_slabs");
+			slab->destroy_by_rcu = get_obj("destroy_by_rcu");
+			slab->hwcache_align = get_obj("hwcache_align");
+			slab->object_size = get_obj("object_size");
+			slab->objects = get_obj("objects");
+			slab->objects_partial = get_obj("objects_partial");
+			slab->objects_total = get_obj("objects_total");
+			slab->objs_per_slab = get_obj("objs_per_slab");
+			slab->order = get_obj("order");
+			/* One read yields both the total and the per-node list. */
+			slab->partial = get_obj_and_str("partial", &t);
+			decode_numa_list(slab->numa_partial, t);
+			free(t);
+			slab->poison = get_obj("poison");
+			slab->reclaim_account = get_obj("reclaim_account");
+			slab->red_zone = get_obj("red_zone");
+			slab->sanity_checks = get_obj("sanity_checks");
+			slab->slab_size = get_obj("slab_size");
+			slab->slabs = get_obj_and_str("slabs", &t);
+			decode_numa_list(slab->numa, t);
+			free(t);
+			slab->store_user = get_obj("store_user");
+			slab->trace = get_obj("trace");
+			slab->alloc_fastpath = get_obj("alloc_fastpath");
+			slab->alloc_slowpath = get_obj("alloc_slowpath");
+			slab->free_fastpath = get_obj("free_fastpath");
+			slab->free_slowpath = get_obj("free_slowpath");
+			slab->free_frozen = get_obj("free_frozen");
+			slab->free_add_partial = get_obj("free_add_partial");
+			slab->free_remove_partial = get_obj("free_remove_partial");
+			slab->alloc_from_partial = get_obj("alloc_from_partial");
+			slab->alloc_slab = get_obj("alloc_slab");
+			slab->alloc_refill = get_obj("alloc_refill");
+			slab->free_slab = get_obj("free_slab");
+			slab->cpuslab_flush = get_obj("cpuslab_flush");
+			slab->deactivate_full = get_obj("deactivate_full");
+			slab->deactivate_empty = get_obj("deactivate_empty");
+			slab->deactivate_to_head = get_obj("deactivate_to_head");
+			slab->deactivate_to_tail = get_obj("deactivate_to_tail");
+			slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
+			slab->order_fallback = get_obj("order_fallback");
+			slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail");
+			slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail");
+			slab->cpu_partial_alloc = get_obj("cpu_partial_alloc");
+			slab->cpu_partial_free = get_obj("cpu_partial_free");
+			slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
+			slab->deactivate_bypass = get_obj("deactivate_bypass");
+			if (chdir(".."))
+				fatal("Unable to chdir from slab ../%s\n",
+				      slab->name);
+			if (slab->name[0] == ':')
+				alias_targets++;
+			slab++;
+			break;
+		default :
+			/* d_type is promoted to int in a variadic call. */
+			fatal("Unknown file type %x\n", (unsigned int)de->d_type);
+		}
+	}
+	closedir(dir);
+	slabs = slab - slabinfo;
+	actual_slabs = slabs;
+	aliases = alias - aliasinfo;
+}
+
+/*
+ * Run the selected per-cache action on each entry of slabinfo[].
+ *
+ * Entries flagged as aliases are skipped. output_lines limits how many
+ * entries are processed; -1 means no limit. Exactly one action runs per
+ * entry, chosen by the first option flag that is set in the order below.
+ */
+static void output_slabs(void)
+{
+	struct slabinfo *end = slabinfo + slabs;
+	struct slabinfo *s;
+	int remaining = output_lines;
+
+	for (s = slabinfo; s < end; s++) {
+		if (remaining == 0)
+			break;
+
+		if (s->alias)
+			continue;
+
+		if (remaining != -1)
+			remaining--;
+
+		if (show_numa) {
+			slab_numa(s, 0);
+		} else if (show_track) {
+			show_tracking(s);
+		} else if (validate) {
+			slab_validate(s);
+		} else if (shrink) {
+			slab_shrink(s);
+		} else if (set_debug) {
+			slab_debug(s);
+		} else if (show_ops) {
+			ops(s);
+		} else if (show_slab) {
+			slabcache(s);
+		} else if (show_report) {
+			report(s);
+		}
+	}
+}
+
+/*
+ * Print one section of the extended totals: select the sort key through the
+ * sort_loss/sort_size/sort_partial globals, emit the heading and its
+ * underline, then sort and print the caches.
+ */
+static void _xtotals(char *heading, char *underline,
+		     int loss, int size, int partial)
+{
+	sort_partial = partial;
+	sort_size = size;
+	sort_loss = loss;
+	line = 0;
+
+	printf("%s%s", heading, underline);
+
+	sort_slabs();
+	output_slabs();
+}
+
+/*
+ * Extended totals: print the overall totals, then the cache list three
+ * times, sorted by size, by loss and by number of partial slabs.
+ */
+static void xtotals(void)
+{
+	/* One row per section: heading, underline, then loss/size/partial. */
+	static const struct {
+		char *heading;
+		char *underline;
+		int loss;
+		int size;
+		int partial;
+	} sections[] = {
+		{ "\nSlabs sorted by size\n",
+		  "--------------------\n", 0, 1, 0 },
+		{ "\nSlabs sorted by loss\n",
+		  "--------------------\n", 1, 0, 0 },
+		{ "\nSlabs sorted by number of partial slabs\n",
+		  "---------------------------------------\n", 0, 0, 1 },
+	};
+	size_t i;
+
+	totals();
+
+	link_slabs();
+	rename_slabs();
+
+	for (i = 0; i < sizeof(sections) / sizeof(sections[0]); i++)
+		_xtotals(sections[i].heading, sections[i].underline,
+			 sections[i].loss, sections[i].size,
+			 sections[i].partial);
+
+	printf("\n");
+}
+
+/*
+ * Long options accepted by getopt_long(). The last field is the value
+ * returned for the option and must be one of the characters handled by the
+ * switch in main(); "partial" therefore maps to 'P', the short option that
+ * selects sorting by partial slabs.
+ */
+struct option opts[] = {
+	{ "aliases", no_argument, NULL, 'a' },
+	{ "activity", no_argument, NULL, 'A' },
+	{ "Bytes", no_argument, NULL, 'B'},
+	{ "debug", optional_argument, NULL, 'd' },
+	{ "display-activity", no_argument, NULL, 'D' },
+	{ "empty", no_argument, NULL, 'e' },
+	{ "first-alias", no_argument, NULL, 'f' },
+	{ "help", no_argument, NULL, 'h' },
+	{ "inverted", no_argument, NULL, 'i'},
+	{ "slabs", no_argument, NULL, 'l' },
+	{ "Loss", no_argument, NULL, 'L'},
+	{ "numa", no_argument, NULL, 'n' },
+	{ "lines", required_argument, NULL, 'N'},
+	{ "ops", no_argument, NULL, 'o' },
+	{ "partial", no_argument, NULL, 'P'},
+	{ "report", no_argument, NULL, 'r' },
+	{ "shrink", no_argument, NULL, 's' },
+	{ "Size", no_argument, NULL, 'S'},
+	{ "tracking", no_argument, NULL, 't'},
+	{ "Totals", no_argument, NULL, 'T'},
+	{ "Unreclaim", no_argument, NULL, 'U'},
+	{ "validate", no_argument, NULL, 'v' },
+	{ "Xtotals", no_argument, NULL, 'X'},
+	{ "zero", no_argument, NULL, 'z' },
+	{ "1ref", no_argument, NULL, '1'},
+	{ NULL, 0, NULL, 0 }
+};
+
+/*
+ * Entry point: translate the command line into the global option flags,
+ * compile the optional cache-name pattern (".*" when none is given), read
+ * the slab directory and run the selected report.
+ */
+int main(int argc, char *argv[])
+{
+	char *pattern_source;
+	int opt;
+	int rc;
+
+	page_size = getpagesize();
+
+	for (;;) {
+		opt = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1",
+				  opts, NULL);
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case 'a':
+			show_alias = 1;
+			break;
+		case 'A':
+			sort_active = 1;
+			break;
+		case 'B':
+			show_bytes = 1;
+			break;
+		case 'd':
+			set_debug = 1;
+			if (!debug_opt_scan(optarg))
+				fatal("Invalid debug option '%s'\n", optarg);
+			break;
+		case 'D':
+			show_activity = 1;
+			break;
+		case 'e':
+			show_empty = 1;
+			break;
+		case 'f':
+			show_first_alias = 1;
+			break;
+		case 'h':
+			usage();
+			return 0;
+		case 'i':
+			show_inverted = 1;
+			break;
+		case 'l':
+			show_slab = 1;
+			break;
+		case 'L':
+			sort_loss = 1;
+			break;
+		case 'n':
+			show_numa = 1;
+			break;
+		case 'N':
+			/* Values below 1 are raised to 1. */
+			if (optarg) {
+				output_lines = atoi(optarg);
+				if (output_lines < 1)
+					output_lines = 1;
+			}
+			break;
+		case 'o':
+			show_ops = 1;
+			break;
+		case 'r':
+			show_report = 1;
+			break;
+		case 'P':
+			sort_partial = 1;
+			break;
+		case 's':
+			shrink = 1;
+			break;
+		case 'S':
+			sort_size = 1;
+			break;
+		case 't':
+			show_track = 1;
+			break;
+		case 'T':
+			show_totals = 1;
+			break;
+		case 'U':
+			unreclaim_only = 1;
+			break;
+		case 'v':
+			validate = 1;
+			break;
+		case 'X':
+			/* Unlimited (-1) output is cut down to one line. */
+			if (output_lines == -1)
+				output_lines = 1;
+			extended_totals = 1;
+			show_bytes = 1;
+			break;
+		case 'z':
+			skip_zero = 0;
+			break;
+		case '1':
+			show_single_ref = 1;
+			break;
+		default:
+			fatal("%s: Invalid option '%c'\n", argv[0], optopt);
+		}
+	}
+
+	/* No action selected on the command line: list the caches. */
+	if (!(show_slab || show_alias || show_track || show_report ||
+	      validate || shrink || set_debug || show_ops))
+		show_slab = 1;
+
+	pattern_source = (argc > optind) ? argv[optind] : ".*";
+
+	rc = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
+	if (rc)
+		fatal("%s: Invalid pattern '%s' code %d\n",
+			argv[0], pattern_source, rc);
+
+	read_slab_dir();
+
+	if (show_alias) {
+		alias();
+		return 0;
+	}
+	if (extended_totals) {
+		xtotals();
+		return 0;
+	}
+	if (show_totals) {
+		totals();
+		return 0;
+	}
+
+	link_slabs();
+	rename_slabs();
+	sort_slabs();
+	output_slabs();
+	return 0;
+}
diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c
new file mode 100644
index 000000000000..83afc52275a5
--- /dev/null
+++ b/tools/mm/thp_swap_allocator_test.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * thp_swap_allocator_test
+ *
+ * The purpose of this test program is helping check if THP swpout
+ * can correctly get swap slots to swap out as a whole instead of
+ * being split. It randomly releases swap entries through madvise
+ * DONTNEED and swapin/out on two memory areas: a memory area for
+ * 64KB THP and the other area for small folios. The second memory
+ * can be enabled by "-s".
+ * Before running the program, we need to setup a zRAM or similar
+ * swap device by:
+ * echo lzo > /sys/block/zram0/comp_algorithm
+ * echo 64M > /sys/block/zram0/disksize
+ * echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
+ * echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
+ * mkswap /dev/zram0
+ * swapon /dev/zram0
+ * The expected result should be 0% anon swpout fallback ratio w/ or
+ * w/o "-s".
+ *
+ * Author(s): Barry Song <v-songbaohua@oppo.com>
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <time.h>
+
+#define MEMSIZE_MTHP (60 * 1024 * 1024)
+#define MEMSIZE_SMALLFOLIO (4 * 1024 * 1024)
+#define ALIGNMENT_MTHP (64 * 1024)
+#define ALIGNMENT_SMALLFOLIO (4 * 1024)
+#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
+#define TOTAL_DONTNEED_SMALLFOLIO (1 * 1024 * 1024)
+#define MTHP_FOLIO_SIZE (64 * 1024)
+
+#define SWPOUT_PATH \
+ "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
+#define SWPOUT_FALLBACK_PATH \
+ "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
+
+/*
+ * Allocate @size bytes aligned to @alignment with posix_memalign().
+ *
+ * Returns the buffer, to be released with free(), or NULL after printing a
+ * diagnostic when the allocation fails.
+ */
+static void *aligned_alloc_mem(size_t size, size_t alignment)
+{
+	void *mem = NULL;
+	int ret;
+
+	/*
+	 * posix_memalign() reports failure through its return value and does
+	 * not set errno, so copy the code into errno before calling perror().
+	 */
+	ret = posix_memalign(&mem, alignment, size);
+	if (ret != 0) {
+		errno = ret;
+		perror("posix_memalign");
+		return NULL;
+	}
+	return mem;
+}
+
+/*
+ * This emulates the behavior of native libc and Java heap,
+ * as well as process exit and munmap. It helps generate mTHP
+ * and ensures that iterations can proceed with mTHP, as we
+ * currently don't support large folios swap-in.
+ *
+ * total_dontneed_size / align_size times, a random align_size-aligned
+ * chunk of [mem, mem + mem_size) is dropped with MADV_DONTNEED and then
+ * written again with 0x11 bytes. A failing madvise() is only reported.
+ */
+static void random_madvise_dontneed(void *mem, size_t mem_size,
+		size_t align_size, size_t total_dontneed_size)
+{
+	size_t rounds = total_dontneed_size / align_size;
+	size_t round;
+
+	for (round = 0; round < rounds; ++round) {
+		size_t slot = rand() % (mem_size / align_size);
+		char *chunk = (char *)mem + slot * align_size;
+
+		if (madvise(chunk, align_size, MADV_DONTNEED) != 0)
+			perror("madvise dontneed");
+
+		memset(chunk, 0x11, align_size);
+	}
+}
+
+/*
+ * Write 0x11 bytes to total_swapin_size / align_size randomly chosen,
+ * align_size-aligned chunks of [mem, mem + mem_size). Each write touches
+ * the chunk's pages, which the caller uses to trigger swap-in.
+ */
+static void random_swapin(void *mem, size_t mem_size,
+		size_t align_size, size_t total_swapin_size)
+{
+	size_t rounds = total_swapin_size / align_size;
+	size_t round;
+
+	for (round = 0; round < rounds; ++round) {
+		size_t slot = rand() % (mem_size / align_size);
+
+		memset((char *)mem + slot * align_size, 0x11, align_size);
+	}
+}
+
+/*
+ * Read one unsigned decimal counter from the file at @path.
+ *
+ * Returns the parsed value. When the file cannot be opened or does not
+ * start with a number, a diagnostic is printed and 0 is returned.
+ */
+static unsigned long read_stat(const char *path)
+{
+	unsigned long value;
+	unsigned long result = 0;
+	FILE *file = fopen(path, "r");
+
+	if (file == NULL) {
+		perror("fopen");
+		return 0;
+	}
+
+	if (fscanf(file, "%lu", &value) == 1)
+		result = value;
+	else
+		perror("fscanf");
+
+	fclose(file);
+	return result;
+}
+
+/*
+ * Run 100 swap-in / DONTNEED / swap-out iterations and print, for each one,
+ * how far the 64kB swpout and swpout_fallback counters advanced and the
+ * resulting fallback percentage.
+ *
+ * Options: "-s" also exercises a second, small-folio area; "-a" performs the
+ * random swap-in of the mTHP area in ALIGNMENT_MTHP sized chunks instead of
+ * ALIGNMENT_SMALLFOLIO sized ones.
+ *
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE when an allocation or a checked
+ * madvise() call fails.
+ */
+int main(int argc, char *argv[])
+{
+	int use_small_folio = 0, aligned_swapin = 0;
+	void *mem1 = NULL, *mem2 = NULL;
+	int i;
+
+	for (i = 1; i < argc; ++i) {
+		if (strcmp(argv[i], "-s") == 0)
+			use_small_folio = 1;
+		else if (strcmp(argv[i], "-a") == 0)
+			aligned_swapin = 1;
+	}
+
+	mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
+	if (mem1 == NULL) {
+		fprintf(stderr, "Failed to allocate large folios memory\n");
+		return EXIT_FAILURE;
+	}
+
+	if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
+		perror("madvise hugepage for mem1");
+		free(mem1);
+		return EXIT_FAILURE;
+	}
+
+	if (use_small_folio) {
+		mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
+		if (mem2 == NULL) {
+			fprintf(stderr, "Failed to allocate small folios memory\n");
+			free(mem1);
+			return EXIT_FAILURE;
+		}
+
+		if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
+			perror("madvise nohugepage for mem2");
+			free(mem1);
+			free(mem2);
+			return EXIT_FAILURE;
+		}
+	}
+
+	/* warm-up phase to occupy the swapfile */
+	memset(mem1, 0x11, MEMSIZE_MTHP);
+	madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT);
+	if (use_small_folio) {
+		memset(mem2, 0x11, MEMSIZE_SMALLFOLIO);
+		madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT);
+	}
+
+	/* iterations with newly created mTHP, swap-in, and swap-out */
+	for (i = 0; i < 100; ++i) {
+		unsigned long initial_swpout;
+		unsigned long initial_swpout_fallback;
+		unsigned long final_swpout;
+		unsigned long final_swpout_fallback;
+		unsigned long swpout_inc;
+		unsigned long swpout_fallback_inc;
+		unsigned long total_inc;
+		double fallback_percentage;
+
+		initial_swpout = read_stat(SWPOUT_PATH);
+		initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+		/*
+		 * The following setup creates a 1:1 ratio of mTHP to small folios
+		 * since large folio swap-in isn't supported yet. Once we support
+		 * mTHP swap-in, we'll likely need to reduce MEMSIZE_MTHP and
+		 * increase MEMSIZE_SMALLFOLIO to maintain the ratio.
+		 */
+		random_swapin(mem1, MEMSIZE_MTHP,
+				aligned_swapin ? ALIGNMENT_MTHP : ALIGNMENT_SMALLFOLIO,
+				TOTAL_DONTNEED_MTHP);
+		random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
+				TOTAL_DONTNEED_MTHP);
+
+		if (use_small_folio) {
+			random_swapin(mem2, MEMSIZE_SMALLFOLIO,
+					ALIGNMENT_SMALLFOLIO,
+					TOTAL_DONTNEED_SMALLFOLIO);
+		}
+
+		if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
+			perror("madvise pageout for mem1");
+			free(mem1);
+			free(mem2);
+			return EXIT_FAILURE;
+		}
+
+		if (use_small_folio) {
+			if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
+				perror("madvise pageout for mem2");
+				free(mem1);
+				free(mem2);
+				return EXIT_FAILURE;
+			}
+		}
+
+		final_swpout = read_stat(SWPOUT_PATH);
+		final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+		swpout_inc = final_swpout - initial_swpout;
+		swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
+
+		/*
+		 * Neither counter moves when nothing was swapped out (or the
+		 * stats could not be read); report 0% rather than dividing by
+		 * zero and printing NaN.
+		 */
+		total_inc = swpout_fallback_inc + swpout_inc;
+		if (total_inc != 0)
+			fallback_percentage = (double)swpout_fallback_inc /
+				total_inc * 100;
+		else
+			fallback_percentage = 0.0;
+
+		printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
+				i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);
+	}
+
+	free(mem1);
+	free(mem2);
+
+	return EXIT_SUCCESS;
+}
diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps
new file mode 100644
index 000000000000..803e0318f2fe
--- /dev/null
+++ b/tools/mm/thpmaps
@@ -0,0 +1,675 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 ARM Ltd.
+#
+# Utility providing smaps-like output detailing transparent hugepage usage.
+# For more info, run:
+# ./thpmaps --help
+#
+# Requires numpy:
+# pip3 install numpy
+
+
+import argparse
+import collections
+import math
+import os
+import re
+import resource
+import shutil
+import sys
+import textwrap
+import time
+import numpy as np
+
+
+with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
+ PAGE_SIZE = resource.getpagesize()
+ PAGE_SHIFT = int(math.log2(PAGE_SIZE))
+ PMD_SIZE = int(f.read())
+ PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
+
+
+def align_forward(v, a):
+ return (v + (a - 1)) & ~(a - 1)
+
+
+def align_offset(v, a):
+ return v & (a - 1)
+
+
+def kbnr(kb):
+ # Convert KB to number of pages.
+ return (kb << 10) >> PAGE_SHIFT
+
+
+def nrkb(nr):
+ # Convert number of pages to KB.
+ return (nr << PAGE_SHIFT) >> 10
+
+
+def odkb(order):
+ # Convert page order to KB.
+ return (PAGE_SIZE << order) >> 10
+
+
+def cont_ranges_all(search, index):
+ # Given a list of arrays, find the ranges for which values are monotonically
+ # incrementing in all arrays. all arrays in search and index must be the
+ # same size.
+ sz = len(search[0])
+ r = np.full(sz, 2)
+ d = np.diff(search[0]) == 1
+ for dd in [np.diff(arr) == 1 for arr in search[1:]]:
+ d &= dd
+ r[1:] -= d
+ r[:-1] -= d
+ return [np.repeat(arr, r).reshape(-1, 2) for arr in index]
+
+
+class ArgException(Exception):
+ pass
+
+
+class FileIOException(Exception):
+ pass
+
+
+class BinArrayFile:
+ # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
+ # numpy array. Use inherrited class in a with clause to ensure file is
+ # closed when it goes out of scope.
+ def __init__(self, filename, element_size):
+ self.element_size = element_size
+ self.filename = filename
+ self.fd = os.open(self.filename, os.O_RDONLY)
+
+ def cleanup(self):
+ os.close(self.fd)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.cleanup()
+
+ def _readin(self, offset, buffer):
+ length = os.preadv(self.fd, (buffer,), offset)
+ if len(buffer) != length:
+ raise FileIOException('error: {} failed to read {} bytes at {:x}'
+ .format(self.filename, len(buffer), offset))
+
+ def _toarray(self, buf):
+ assert(self.element_size == 8)
+ return np.frombuffer(buf, dtype=np.uint64)
+
+ def getv(self, vec):
+ vec *= self.element_size
+ offsets = vec[:, 0]
+ lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
+ buf = bytearray(int(np.sum(lengths)))
+ view = memoryview(buf)
+ pos = 0
+ for offset, length in zip(offsets, lengths):
+ offset = int(offset)
+ length = int(length)
+ self._readin(offset, view[pos:pos+length])
+ pos += length
+ return self._toarray(buf)
+
+ def get(self, index, nr=1):
+ offset = index * self.element_size
+ length = nr * self.element_size
+ buf = bytearray(length)
+ self._readin(offset, buf)
+ return self._toarray(buf)
+
+
+PM_PAGE_PRESENT = 1 << 63
+PM_PFN_MASK = (1 << 55) - 1
+
+class PageMap(BinArrayFile):
+ # Read ranges of a given pid's pagemap into a numpy array.
+ def __init__(self, pid='self'):
+ super().__init__(f'/proc/{pid}/pagemap', 8)
+
+
+KPF_ANON = 1 << 12
+KPF_COMPOUND_HEAD = 1 << 15
+KPF_COMPOUND_TAIL = 1 << 16
+KPF_THP = 1 << 22
+
+class KPageFlags(BinArrayFile):
+ # Read ranges of /proc/kpageflags into a numpy array.
+ def __init__(self):
+ super().__init__(f'/proc/kpageflags', 8)
+
+
+vma_all_stats = set([
+ "Size",
+ "Rss",
+ "Pss",
+ "Pss_Dirty",
+ "Shared_Clean",
+ "Shared_Dirty",
+ "Private_Clean",
+ "Private_Dirty",
+ "Referenced",
+ "Anonymous",
+ "KSM",
+ "LazyFree",
+ "AnonHugePages",
+ "ShmemPmdMapped",
+ "FilePmdMapped",
+ "Shared_Hugetlb",
+ "Private_Hugetlb",
+ "Swap",
+ "SwapPss",
+ "Locked",
+])
+
+vma_min_stats = set([
+ "Rss",
+ "Anonymous",
+ "AnonHugePages",
+ "ShmemPmdMapped",
+ "FilePmdMapped",
+])
+
+VMA = collections.namedtuple('VMA', [
+ 'name',
+ 'start',
+ 'end',
+ 'read',
+ 'write',
+ 'execute',
+ 'private',
+ 'pgoff',
+ 'major',
+ 'minor',
+ 'inode',
+ 'stats',
+])
+
+class VMAList:
+ # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
+ # instance to receive VMAs.
+ def __init__(self, pid='self', stats=[]):
+ self.vmas = []
+ with open(f'/proc/{pid}/smaps', 'r') as file:
+ for line in file:
+ elements = line.split()
+ if '-' in elements[0]:
+ start, end = map(lambda x: int(x, 16), elements[0].split('-'))
+ major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
+ self.vmas.append(VMA(
+ name=elements[5] if len(elements) == 6 else '',
+ start=start,
+ end=end,
+ read=elements[1][0] == 'r',
+ write=elements[1][1] == 'w',
+ execute=elements[1][2] == 'x',
+ private=elements[1][3] == 'p',
+ pgoff=int(elements[2], 16),
+ major=major,
+ minor=minor,
+ inode=int(elements[4], 16),
+ stats={},
+ ))
+ else:
+ param = elements[0][:-1]
+ if param in stats:
+ value = int(elements[1])
+ self.vmas[-1].stats[param] = {'type': None, 'value': value}
+
+ def __iter__(self):
+ yield from self.vmas
+
+
+def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
+ # Given 4 same-sized arrays representing a range within a page table backed
+ # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+ # True if page is anonymous, heads: True if page is head of a THP), return a
+ # dictionary of statistics describing the mapped THPs.
+ #
+ # indexes holds the positions 0..len-1 into those arrays. ranges is a list
+ # of (N, 2) [first, last] arrays; ranges[0] is read as index ranges and
+ # ranges[2] as the matching pfn ranges. kpageflags is used to look up the
+ # flags of the page following each range. vma supplies the smaps
+ # AnonHugePages, ShmemPmdMapped and FilePmdMapped values.
+ #
+ # The result maps a label to {'type': 'anon'|'file', 'value': <kB>}.
+ # Per-order counters are kept in pages while parsing.
+ stats = {
+ 'file': {
+ 'partial': 0,
+ 'aligned': [0] * (PMD_ORDER + 1),
+ 'unaligned': [0] * (PMD_ORDER + 1),
+ },
+ 'anon': {
+ 'partial': 0,
+ 'aligned': [0] * (PMD_ORDER + 1),
+ 'unaligned': [0] * (PMD_ORDER + 1),
+ },
+ }
+
+ for rindex, rpfn in zip(ranges[0], ranges[2]):
+ # Half-open index window [index_next, index_end) for this range;
+ # pfn_end is the frame just past its last frame.
+ index_next = int(rindex[0])
+ index_end = int(rindex[1]) + 1
+ pfn_end = int(rpfn[1]) + 1
+
+ # Indexes within the window at which a THP head page is mapped.
+ folios = indexes[index_next:index_end][heads[index_next:index_end]]
+
+ # Account pages for any partially mapped THP at the front. In that case,
+ # the first page of the range is a tail.
+ nr = (int(folios[0]) if len(folios) else index_end) - index_next
+ stats['anon' if anons[index_next] else 'file']['partial'] += nr
+
+ # Account pages for any partially mapped THP at the back. In that case,
+ # the next page after the range is a tail.
+ if len(folios):
+ flags = int(kpageflags.get(pfn_end)[0])
+ if flags & KPF_COMPOUND_TAIL:
+ nr = index_end - int(folios[-1])
+ folios = folios[:-1]
+ index_end -= nr
+ stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr
+
+ # Account fully mapped THPs in the middle of the range.
+ if len(folios):
+ # Size of each folio is the distance to the next head (or to the end
+ # of the window for the last one).
+ folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
+ folio_orders = np.log2(folio_nrs).astype(np.uint64)
+ for index, order in zip(folios, folio_orders):
+ index = int(index)
+ order = int(order)
+ nr = 1 << order
+ vfn = int(vfns[index])
+ align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
+ anon = 'anon' if anons[index] else 'file'
+ stats[anon][align][order] += nr
+
+ # Account PMD-mapped THPs separately, so filter out of the stats. There is a
+ # race between acquiring the smaps stats and reading pagemap, where memory
+ # could be deallocated. So clamp to zero in case it would have gone negative.
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+ file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+ vma.stats['FilePmdMapped']['value']
+ stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
+ stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
+
+ rstats = {
+ f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+ f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
+ }
+
+ def flatten_sub(type, subtype, stats):
+ # Emit one labelled entry per order; orders 0 and 1 are not reported.
+ param = f"{type}-thp-pte-{subtype}-{{}}kB"
+ for od, nr in enumerate(stats[2:], 2):
+ rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
+
+ def flatten_type(type, stats):
+ flatten_sub(type, 'aligned', stats['aligned'])
+ flatten_sub(type, 'unaligned', stats['unaligned'])
+ rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
+
+ flatten_type('anon', stats['anon'])
+ flatten_type('file', stats['file'])
+
+ return rstats
+
+
+def cont_parse(vma, order, ranges, anons, heads):
+ # Given the contiguous ranges within a page table backed by THPs (ranges: a
+ # list of three (N, 2) [first, last] arrays holding index, virtual frame
+ # number and physical frame number ranges) and the per-page arrays anons
+ # (True if page is anonymous) and heads (True if page is head of a THP),
+ # return a dictionary of statistics describing the contiguous blocks of
+ # 2^order pages. vma supplies the smaps AnonHugePages, ShmemPmdMapped and
+ # FilePmdMapped values.
+ nr_cont = 1 << order
+ nr_anon = 0
+ nr_file = 0
+
+ for rindex, rvfn, rpfn in zip(*ranges):
+ index_next = int(rindex[0])
+ index_end = int(rindex[1]) + 1
+ vfn_start = int(rvfn[0])
+ pfn_start = int(rpfn[0])
+
+ # Skip ranges whose virtual and physical start do not share the same
+ # offset within a block.
+ if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
+ continue
+
+ # Advance to the first block-aligned virtual frame of the range.
+ off = align_forward(vfn_start, nr_cont) - vfn_start
+ index_next += off
+
+ while index_next + nr_cont <= index_end:
+ # A head page after the first page of the block means the block
+ # spans more than one folio.
+ folio_boundary = heads[index_next+1:index_next+nr_cont].any()
+ if not folio_boundary:
+ if anons[index_next]:
+ nr_anon += nr_cont
+ else:
+ nr_file += nr_cont
+ index_next += nr_cont
+
+ # Account blocks that are PMD-mapped separately, so filter out of the stats.
+ # There is a race between acquiring the smaps stats and reading pagemap,
+ # where memory could be deallocated. So clamp to zero in case it would have
+ # gone negative.
+ anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+ file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+ vma.stats['FilePmdMapped']['value']
+ nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
+ nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
+
+ rstats = {
+ f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+ f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
+ }
+
+ rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
+ rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}
+
+ return rstats
+
+
+def vma_print(vma, pid):
+ # Prints a VMA instance in a format similar to smaps. The main difference is
+ # that the pid is included as the first value.
+ print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
+ .format(
+ pid, vma.start, vma.end,
+ 'r' if vma.read else '-', 'w' if vma.write else '-',
+ 'x' if vma.execute else '-', 'p' if vma.private else 's',
+ vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
+ ))
+
+
+def stats_print(stats, tot_anon, tot_file, inc_empty):
+ # Print a statistics dictionary.
+ label_field = 32
+ for label, stat in stats.items():
+ type = stat['type']
+ value = stat['value']
+ if value or inc_empty:
+ pad = max(0, label_field - len(label) - 1)
+ if type == 'anon' and tot_anon > 0:
+ percent = f' ({value / tot_anon:3.0%})'
+ elif type == 'file' and tot_file > 0:
+ percent = f' ({value / tot_file:3.0%})'
+ else:
+ percent = ''
+ print(f"{label}:{' ' * pad}{value:8} kB{percent}")
+
+
+def vma_parse(vma, pagemap, kpageflags, contorders):
+ # Generate thp and cont statistics for a single VMA.
+ #
+ # pagemap and kpageflags are the open readers for the process's pagemap and
+ # for /proc/kpageflags; contorders is the list of block orders to produce
+ # contiguous-block statistics for. Returns a tuple of (merged statistics
+ # dictionary, total anonymous value, total file value), the totals being
+ # taken from the VMA's smaps 'Anonymous' and 'Rss' fields.
+ start = vma.start >> PAGE_SHIFT
+ end = vma.end >> PAGE_SHIFT
+
+ # One pagemap entry per virtual page; keep only the present pages.
+ pmes = pagemap.get(start, end - start)
+ present = pmes & PM_PAGE_PRESENT != 0
+ pfns = pmes & PM_PFN_MASK
+ pfns = pfns[present]
+ vfns = np.arange(start, end, dtype=np.uint64)
+ vfns = vfns[present]
+
+ # Read the page flags in as few reads as possible by grouping physically
+ # consecutive frames into ranges.
+ pfn_vec = cont_ranges_all([pfns], [pfns])[0]
+ flags = kpageflags.getv(pfn_vec)
+ anons = flags & KPF_ANON != 0
+ heads = flags & KPF_COMPOUND_HEAD != 0
+ thps = flags & KPF_THP != 0
+
+ # From here on only pages that belong to a THP are considered.
+ vfns = vfns[thps]
+ pfns = pfns[thps]
+ anons = anons[thps]
+ heads = heads[thps]
+
+ # Ranges that are contiguous both virtually and physically.
+ indexes = np.arange(len(vfns), dtype=np.uint64)
+ ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])
+
+ thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
+ contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]
+
+ tot_anon = vma.stats['Anonymous']['value']
+ tot_file = vma.stats['Rss']['value'] - tot_anon
+
+ return {
+ **thpstats,
+ **{k: v for s in contstats for k, v in s.items()}
+ }, tot_anon, tot_file
+
+
+def do_main(args):
+ # Perform one scan and print its results.
+ #
+ # The set of pids comes from the cgroup subtree (--cgroup), from the --pid
+ # arguments, or from the numeric entries of /proc. For every VMA of every
+ # pid the statistics are either printed immediately or, with --rollup,
+ # summed and printed once at the end. Only for explicitly given pids
+ # (strict) are file access errors propagated; otherwise the pid is skipped.
+ pids = set()
+ rollup = {}
+ rollup_anon = 0
+ rollup_file = 0
+
+ if args.cgroup:
+ strict = False
+ for walk_info in os.walk(args.cgroup):
+ cgroup = walk_info[0]
+ with open(f'{cgroup}/cgroup.procs') as pidfile:
+ for line in pidfile.readlines():
+ pids.add(int(line.strip()))
+ elif args.pid:
+ strict = True
+ pids = pids.union(args.pid)
+ else:
+ strict = False
+ for pid in os.listdir('/proc'):
+ if pid.isdigit():
+ pids.add(int(pid))
+
+ if not args.rollup:
+ print(" PID START END PROT OFFSET DEV INODE OBJECT")
+
+ for pid in pids:
+ try:
+ with PageMap(pid) as pagemap:
+ with KPageFlags() as kpageflags:
+ for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
+ # Only mappings with some permission and resident pages are
+ # parsed; the others contribute empty statistics.
+ if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
+ stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
+ else:
+ stats = {}
+ vma_anon = 0
+ vma_file = 0
+ if args.inc_smaps:
+ stats = {**vma.stats, **stats}
+ if args.rollup:
+ for k, v in stats.items():
+ if k in rollup:
+ assert(rollup[k]['type'] == v['type'])
+ rollup[k]['value'] += v['value']
+ else:
+ rollup[k] = v
+ rollup_anon += vma_anon
+ rollup_file += vma_file
+ else:
+ vma_print(vma, pid)
+ stats_print(stats, vma_anon, vma_file, args.inc_empty)
+ except (FileNotFoundError, ProcessLookupError, FileIOException):
+ if strict:
+ raise
+
+ if args.rollup:
+ stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
+
+
+def main():
+ docs_width = shutil.get_terminal_size().columns
+ docs_width -= 2
+ docs_width = min(80, docs_width)
+
+ def format(string):
+ text = re.sub(r'\s+', ' ', string)
+ text = re.sub(r'\s*\\n\s*', '\n', text)
+ paras = text.split('\n')
+ paras = [textwrap.fill(p, width=docs_width) for p in paras]
+ return '\n'.join(paras)
+
+ def formatter(prog):
+ return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)
+
+ def size2order(human):
+ units = {
+ "K": 2**10, "M": 2**20, "G": 2**30,
+ "k": 2**10, "m": 2**20, "g": 2**30,
+ }
+ unit = 1
+ if human[-1] in units:
+ unit = units[human[-1]]
+ human = human[:-1]
+ try:
+ size = int(human)
+ except ValueError:
+ raise ArgException('error: --cont value must be integer size with optional KMG unit')
+ size *= unit
+ order = int(math.log2(size / PAGE_SIZE))
+ if order < 1:
+ raise ArgException('error: --cont value must be size of at least 2 pages')
+ if (1 << order) * PAGE_SIZE != size:
+ raise ArgException('error: --cont value must be size of power-of-2 pages')
+ if order > PMD_ORDER:
+ raise ArgException('error: --cont value must be less than or equal to PMD order')
+ return order
+
+ parser = argparse.ArgumentParser(formatter_class=formatter,
+ description=format("""Prints information about how transparent huge
+ pages are mapped, either system-wide, or for a specified
+ process or cgroup.\\n
+ \\n
+ When run with --pid, the user explicitly specifies the set
+ of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
+ with --cgroup, the user passes either a v1 or v2 cgroup and
+ all pids that belong to the cgroup subtree are scanned. When
+ run with neither --pid nor --cgroup, the full set of pids on
+ the system is gathered from /proc and scanned as if the user
+ had provided "--pid 1 --pid 2 ...".\\n
+ \\n
+ A default set of statistics is always generated for THP
+ mappings. However, it is also possible to generate
+ additional statistics for "contiguous block mappings" where
+ the block size is user-defined.\\n
+ \\n
+ Statistics are maintained independently for anonymous and
+ file-backed (pagecache) memory and are shown both in kB and
+ as a percentage of either total anonymous or total
+ file-backed memory as appropriate.\\n
+ \\n
+ THP Statistics\\n
+ --------------\\n
+ \\n
+ Statistics are always generated for fully- and
+ contiguously-mapped THPs whose mapping address is aligned to
+ their size, for each <size> supported by the system.
+ Separate counters describe THPs mapped by PTE vs those
+ mapped by PMD. (Although note a THP can only be mapped by
+ PMD if it is PMD-sized):\\n
+ \\n
+ - anon-thp-pte-aligned-<size>kB\\n
+ - file-thp-pte-aligned-<size>kB\\n
+ - anon-thp-pmd-aligned-<size>kB\\n
+ - file-thp-pmd-aligned-<size>kB\\n
+ \\n
+ Similarly, statistics are always generated for fully- and
+ contiguously-mapped THPs whose mapping address is *not*
+ aligned to their size, for each <size> supported by the
+ system. Due to the unaligned mapping, it is impossible to
+ map by PMD, so there are only PTE counters for this case:\\n
+ \\n
+ - anon-thp-pte-unaligned-<size>kB\\n
+ - file-thp-pte-unaligned-<size>kB\\n
+ \\n
+ Statistics are also always generated for mapped pages that
+ belong to a THP but where the is THP is *not* fully- and
+ contiguously- mapped. These "partial" mappings are all
+ counted in the same counter regardless of the size of the
+ THP that is partially mapped:\\n
+ \\n
+ - anon-thp-pte-partial\\n
+ - file-thp-pte-partial\\n
+ \\n
+ Contiguous Block Statistics\\n
+ ---------------------------\\n
+ \\n
+ An optional, additional set of statistics is generated for
+ every contiguous block size specified with `--cont <size>`.
+ These statistics show how much memory is mapped in
+ contiguous blocks of <size> and also aligned to <size>. A
+ given contiguous block must all belong to the same THP, but
+ there is no requirement for it to be the *whole* THP.
+ Separate counters describe contiguous blocks mapped by PTE
+ vs those mapped by PMD:\\n
+ \\n
+ - anon-cont-pte-aligned-<size>kB\\n
+ - file-cont-pte-aligned-<size>kB\\n
+ - anon-cont-pmd-aligned-<size>kB\\n
+ - file-cont-pmd-aligned-<size>kB\\n
+ \\n
+ As an example, if monitoring 64K contiguous blocks (--cont
+ 64K), there are a number of sources that could provide such
+ blocks: a fully- and contiguously-mapped 64K THP that is
+ aligned to a 64K boundary would provide 1 block. A fully-
+ and contiguously-mapped 128K THP that is aligned to at least
+ a 64K boundary would provide 2 blocks. Or a 128K THP that
+ maps its first 100K, but contiguously and starting at a 64K
+ boundary would provide 1 block. A fully- and
+ contiguously-mapped 2M THP would provide 32 blocks. There
+ are many other possible permutations.\\n"""),
+ epilog=format("""Requires root privilege to access pagemap and
+ kpageflags."""))
+
+ group = parser.add_mutually_exclusive_group(required=False)
+ group.add_argument('--pid',
+ metavar='pid', required=False, type=int, default=[], action='append',
+ help="""Process id of the target process. May be issued multiple times to
+ scan multiple processes. --pid and --cgroup are mutually exclusive.
+ If neither are provided, all processes are scanned to provide
+ system-wide information.""")
+
+ group.add_argument('--cgroup',
+ metavar='path', required=False,
+ help="""Path to the target cgroup in sysfs. Iterates over every pid in
+ the cgroup and its children. --pid and --cgroup are mutually
+ exclusive. If neither are provided, all processes are scanned to
+ provide system-wide information.""")
+
+ parser.add_argument('--rollup',
+ required=False, default=False, action='store_true',
+ help="""Sum the per-vma statistics to provide a summary over the whole
+ system, process or cgroup.""")
+
+ parser.add_argument('--cont',
+ metavar='size[KMG]', required=False, default=[], action='append',
+ help="""Adds stats for memory that is mapped in contiguous blocks of
+ <size> and also aligned to <size>. May be issued multiple times to
+ track multiple sized blocks. Useful to infer e.g. arm64 contpte and
+ hpa mappings. Size must be a power-of-2 number of pages.""")
+
+ parser.add_argument('--inc-smaps',
+ required=False, default=False, action='store_true',
+ help="""Include all numerical, additive /proc/<pid>/smaps stats in the
+ output.""")
+
+ parser.add_argument('--inc-empty',
+ required=False, default=False, action='store_true',
+ help="""Show all statistics including those whose value is 0.""")
+
+ parser.add_argument('--periodic',
+ metavar='sleep_ms', required=False, type=int,
+ help="""Run in a loop, polling every sleep_ms milliseconds.""")
+
+ args = parser.parse_args()
+
+ try:
+ args.cont = [size2order(cont) for cont in args.cont]
+ except ArgException as e:
+ parser.print_usage()
+ raise
+
+ if args.periodic:
+ while True:
+ do_main(args)
+ print()
+ time.sleep(args.periodic / 1000)
+ else:
+ do_main(args)
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ except Exception as e:
+ prog = os.path.basename(sys.argv[0])
+ print(f'{prog}: {e}')
+ exit(1)