diff options
Diffstat (limited to 'tools/testing/selftests/cgroup')
19 files changed, 3679 insertions, 488 deletions
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index c4a57e69f749..952e4448bf07 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -1,8 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only -test_memcontrol test_core +test_cpu +test_cpuset test_freezer -test_kmem +test_hugetlb_memcg test_kill -test_cpu +test_kmem +test_memcontrol +test_pids +test_zswap wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 3d263747d2ad..e01584c2189a 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -4,22 +4,32 @@ CFLAGS += -Wall -pthread all: ${HELPER_PROGS} TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh TEST_GEN_FILES := wait_inotify -TEST_GEN_PROGS = test_memcontrol -TEST_GEN_PROGS += test_kmem -TEST_GEN_PROGS += test_core +# Keep the lists lexicographically sorted +TEST_GEN_PROGS = test_core +TEST_GEN_PROGS += test_cpu +TEST_GEN_PROGS += test_cpuset TEST_GEN_PROGS += test_freezer +TEST_GEN_PROGS += test_hugetlb_memcg TEST_GEN_PROGS += test_kill -TEST_GEN_PROGS += test_cpu +TEST_GEN_PROGS += test_kmem +TEST_GEN_PROGS += test_memcontrol +TEST_GEN_PROGS += test_pids +TEST_GEN_PROGS += test_zswap LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h include ../lib.mk +include lib/libcgroup.mk -$(OUTPUT)/test_memcontrol: cgroup_util.c -$(OUTPUT)/test_kmem: cgroup_util.c -$(OUTPUT)/test_core: cgroup_util.c -$(OUTPUT)/test_freezer: cgroup_util.c -$(OUTPUT)/test_kill: cgroup_util.c -$(OUTPUT)/test_cpu: cgroup_util.c +$(OUTPUT)/test_core: $(LIBCGROUP_O) +$(OUTPUT)/test_cpu: $(LIBCGROUP_O) +$(OUTPUT)/test_cpuset: $(LIBCGROUP_O) +$(OUTPUT)/test_freezer: $(LIBCGROUP_O) +$(OUTPUT)/test_hugetlb_memcg: $(LIBCGROUP_O) +$(OUTPUT)/test_kill: $(LIBCGROUP_O) +$(OUTPUT)/test_kmem: $(LIBCGROUP_O) +$(OUTPUT)/test_memcontrol: $(LIBCGROUP_O) +$(OUTPUT)/test_pids: $(LIBCGROUP_O) +$(OUTPUT)/test_zswap: $(LIBCGROUP_O) diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 97d549ee894f..39f979690dd3 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -3,5 +3,4 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y -CONFIG_MEMCG_KMEM=y CONFIG_PAGE_COUNTER=y diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index e8bbbdb77e0d..44c52f620fda 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -17,10 +17,12 @@ #include <unistd.h> #include "cgroup_util.h" -#include "../clone3/clone3_selftests.h" +#include "../../clone3/clone3_selftests.h" + +bool cg_test_v1_named; /* Returns read len on success, or -errno on failure. */ -static ssize_t read_text(const char *path, char *buf, size_t max_len) +ssize_t read_text(const char *path, char *buf, size_t max_len) { ssize_t len; int fd; @@ -39,7 +41,7 @@ static ssize_t read_text(const char *path, char *buf, size_t max_len) } /* Returns written len on success, or -errno on failure. */ -static ssize_t write_text(const char *path, char *buf, ssize_t len) +ssize_t write_text(const char *path, char *buf, ssize_t len) { int fd; @@ -141,6 +143,16 @@ long cg_read_long(const char *cgroup, const char *control) return atol(buf); } +long cg_read_long_fd(int fd) +{ + char buf[128]; + + if (pread(fd, buf, sizeof(buf), 0) <= 0) + return -1; + + return atol(buf); +} + long cg_read_key_long(const char *cgroup, const char *control, const char *key) { char buf[PAGE_SIZE]; @@ -183,6 +195,18 @@ int cg_write(const char *cgroup, const char *control, char *buf) return ret == len ? 0 : ret; } +/* + * Returns fd on success, or -1 on failure. + * (fd should be closed with close() as usual) + */ +int cg_open(const char *cgroup, const char *control, int flags) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s", cgroup, control); + return open(path, flags); +} + int cg_write_numeric(const char *cgroup, const char *control, long value) { char buf[64]; @@ -195,10 +219,11 @@ int cg_write_numeric(const char *cgroup, const char *control, long value) return cg_write(cgroup, control, buf); } -int cg_find_unified_root(char *root, size_t len) +static int cg_find_root(char *root, size_t len, const char *controller, + bool *nsdelegate) { char buf[10 * PAGE_SIZE]; - char *fs, *mount, *type; + char *fs, *mount, *type, *options; const char delim[] = "\n\t "; if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0) @@ -211,19 +236,40 @@ int cg_find_unified_root(char *root, size_t len) for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) { mount = strtok(NULL, delim); type = strtok(NULL, delim); + options = strtok(NULL, delim); strtok(NULL, delim); strtok(NULL, delim); - strtok(NULL, delim); - - if (strcmp(type, "cgroup2") == 0) { - strncpy(root, mount, len); - return 0; + if (strcmp(type, "cgroup") == 0) { + if (!controller || !strstr(options, controller)) + continue; + } else if (strcmp(type, "cgroup2") == 0) { + if (controller && + cg_read_strstr(mount, "cgroup.controllers", controller)) + continue; + } else { + continue; } + strncpy(root, mount, len); + + if (nsdelegate) + *nsdelegate = !!strstr(options, "nsdelegate"); + return 0; + } return -1; } +int cg_find_controller_root(char *root, size_t len, const char *controller) +{ + return cg_find_root(root, len, controller, NULL); +} + +int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) +{ + return cg_find_root(root, len, NULL, nsdelegate); +} + int cg_create(const char *cgroup) { return mkdir(cgroup, 0755); @@ -286,6 +332,8 @@ int cg_destroy(const char *cgroup) { int ret; + if (!cgroup) + return 0; retry: ret = rmdir(cgroup); if (ret && errno == EBUSY) { @@ -315,7 +363,7 @@ int cg_enter_current(const char *cgroup) int cg_enter_current_thread(const char *cgroup) { - return cg_write(cgroup, "cgroup.threads", "0"); + return cg_write(cgroup, CG_THREADS_FILE, "0"); } int cg_run(const char *cgroup, @@ -462,94 +510,28 @@ int cg_run_nowait(const char *cgroup, return pid; } -int get_temp_fd(void) -{ - return open(".", O_TMPFILE | O_RDWR | O_EXCL); -} - -int alloc_pagecache(int fd, size_t size) -{ - char buf[PAGE_SIZE]; - struct stat st; - int i; - - if (fstat(fd, &st)) - goto cleanup; - - size += st.st_size; - - if (ftruncate(fd, size)) - goto cleanup; - - for (i = 0; i < size; i += sizeof(buf)) - read(fd, buf, sizeof(buf)); - - return 0; - -cleanup: - return -1; -} - -int alloc_anon(const char *cgroup, void *arg) +int proc_mount_contains(const char *option) { - size_t size = (unsigned long)arg; - char *buf, *ptr; + char buf[4 * PAGE_SIZE]; + ssize_t read; - buf = malloc(size); - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) - *ptr = 0; + read = read_text("/proc/mounts", buf, sizeof(buf)); + if (read < 0) + return read; - free(buf); - return 0; + return strstr(buf, option) != NULL; } -int is_swap_enabled(void) +int cgroup_feature(const char *feature) { char buf[PAGE_SIZE]; - const char delim[] = "\n"; - int cnt = 0; - char *line; - - if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) - return -1; - - for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) - cnt++; - - return cnt > 1; -} - -int set_oom_adj_score(int pid, int score) -{ - char path[PATH_MAX]; - int fd, len; - - sprintf(path, "/proc/%d/oom_score_adj", pid); - - fd = open(path, O_WRONLY | O_APPEND); - if (fd < 0) - return fd; - - len = dprintf(fd, "%d", score); - if (len < 0) { - close(fd); - return len; - } - - close(fd); - return 0; -} - -int proc_mount_contains(const char *option) -{ - char buf[4 * PAGE_SIZE]; ssize_t read; - read = read_text("/proc/mounts", buf, sizeof(buf)); + read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); if (read < 0) return read; - return strstr(buf, option) != NULL; + return strstr(buf, feature) != NULL; } ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index c92df4e5d395..7ab2824ed7b5 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -2,24 +2,54 @@ #include <stdbool.h> #include <stdlib.h> -#include "../kselftest.h" - +#ifndef PAGE_SIZE #define PAGE_SIZE 4096 +#endif #define MB(x) (x << 20) #define USEC_PER_SEC 1000000L #define NSEC_PER_SEC 1000000000L +#define TEST_UID 65534 /* usually nobody, any !root is fine */ + +#define CG_THREADS_FILE (!cg_test_v1_named ? "cgroup.threads" : "tasks") +#define CG_NAMED_NAME "selftest" +#define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s")) + /* * Checks if two given values differ by less than err% of their sum. */ static inline int values_close(long a, long b, int err) { - return abs(a - b) <= (a + b) / 100 * err; + return labs(a - b) <= (a + b) / 100 * err; } -extern int cg_find_unified_root(char *root, size_t len); +/* + * Checks if two given values differ by less than err% of their sum and assert + * with detailed debug info if not. + */ +static inline int values_close_report(long a, long b, int err) +{ + long diff = labs(a - b); + long limit = (a + b) / 100 * err; + double actual_err = (a + b) ? (100.0 * diff / (a + b)) : 0.0; + int close = diff <= limit; + + if (!close) + fprintf(stderr, + "[FAIL] actual=%ld expected=%ld | diff=%ld | limit=%ld | " + "tolerance=%d%% | actual_error=%.2f%%\n", + a, b, diff, limit, err, actual_err); + + return close; +} + +extern ssize_t read_text(const char *path, char *buf, size_t max_len); +extern ssize_t write_text(const char *path, char *buf, ssize_t len); + +extern int cg_find_controller_root(char *root, size_t len, const char *controller); +extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate); extern char *cg_name(const char *root, const char *name); extern char *cg_name_indexed(const char *root, const char *name, int index); extern char *cg_control(const char *cgroup, const char *control); @@ -32,9 +62,11 @@ extern int cg_read_strcmp(const char *cgroup, const char *control, extern int cg_read_strstr(const char *cgroup, const char *control, const char *needle); extern long cg_read_long(const char *cgroup, const char *control); +extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); +extern int cg_open(const char *cgroup, const char *control, int flags); int cg_write_numeric(const char *cgroup, const char *control, long value); extern int cg_run(const char *cgroup, int (*fn)(const char *cgroup, void *arg), @@ -45,14 +77,10 @@ extern int cg_enter_current_thread(const char *cgroup); extern int cg_run_nowait(const char *cgroup, int (*fn)(const char *cgroup, void *arg), void *arg); -extern int get_temp_fd(void); -extern int alloc_pagecache(int fd, size_t size); -extern int alloc_anon(const char *cgroup, void *arg); -extern int is_swap_enabled(void); -extern int set_oom_adj_score(int pid, int score); extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); int proc_mount_contains(const char *option); +int cgroup_feature(const char *feature); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); extern pid_t clone_into_cgroup(int cgroup_fd); @@ -62,3 +90,4 @@ extern int dirfd_open_opath(const char *dir); extern int cg_prepare_for_wait(const char *cgroup); extern int memcg_prepare_for_wait(const char *cgroup); extern int cg_wait_for(int fd); +extern bool cg_test_v1_named; diff --git a/tools/testing/selftests/cgroup/lib/libcgroup.mk b/tools/testing/selftests/cgroup/lib/libcgroup.mk new file mode 100644 index 000000000000..7a73007204c3 --- /dev/null +++ b/tools/testing/selftests/cgroup/lib/libcgroup.mk @@ -0,0 +1,19 @@ +CGROUP_DIR := $(selfdir)/cgroup + +LIBCGROUP_C := lib/cgroup_util.c + +LIBCGROUP_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBCGROUP_C)) + +LIBCGROUP_O_DIRS := $(shell dirname $(LIBCGROUP_O) | uniq) + +CFLAGS += -I$(CGROUP_DIR)/lib/include + +EXTRA_HDRS := $(selfdir)/clone3/clone3_selftests.h + +$(LIBCGROUP_O_DIRS): + mkdir -p $@ + +$(LIBCGROUP_O): $(OUTPUT)/%.o : $(CGROUP_DIR)/%.c $(EXTRA_HDRS) $(LIBCGROUP_O_DIRS) + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +EXTRA_CLEAN += $(LIBCGROUP_O) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 600123503063..102262555a59 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -5,6 +5,8 @@ #include <linux/sched.h> #include <sys/types.h> #include <sys/mman.h> +#include <sys/mount.h> +#include <sys/stat.h> #include <sys/wait.h> #include <unistd.h> #include <fcntl.h> @@ -15,9 +17,14 @@ #include <string.h> #include <pthread.h> -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" +static bool nsdelegate; +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0 +#endif + static int touch_anon(char *buf, size_t size) { int fd; @@ -146,6 +153,9 @@ static int test_cgcore_populated(const char *root) int cgroup_fd = -EBADF; pid_t pid; + if (cg_test_v1_named) + return KSFT_SKIP; + cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_a/cg_test_b"); cg_test_c = cg_name(root, "cg_test_a/cg_test_b/cg_test_c"); @@ -275,6 +285,9 @@ static int test_cgcore_invalid_domain(const char *root) int ret = KSFT_FAIL; char *grandparent = NULL, *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + grandparent = cg_name(root, "cg_test_grandparent"); parent = cg_name(root, "cg_test_grandparent/cg_test_parent"); child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child"); @@ -337,6 +350,9 @@ static int test_cgcore_parent_becomes_threaded(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -376,7 +392,8 @@ static int test_cgcore_no_internal_process_constraint_on_threads(const char *roo int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; - if (cg_read_strstr(root, "cgroup.controllers", "cpu") || + if (cg_test_v1_named || + cg_read_strstr(root, "cgroup.controllers", "cpu") || cg_write(root, "cgroup.subtree_control", "+cpu")) { ret = KSFT_SKIP; goto cleanup; @@ -428,6 +445,9 @@ static int test_cgcore_top_down_constraint_enable(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -463,6 +483,9 @@ static int test_cgcore_top_down_constraint_disable(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -504,6 +527,9 @@ static int test_cgcore_internal_process_constraint(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -571,7 +597,7 @@ static int test_cgcore_proc_migration(const char *root) } cg_enter_current(dst); - if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1) + if (cg_read_lc(dst, CG_THREADS_FILE) != n_threads + 1) goto cleanup; ret = KSFT_PASS; @@ -603,7 +629,7 @@ static void *migrating_thread_fn(void *arg) char lines[3][PATH_MAX]; for (g = 1; g < 3; ++g) - snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0])); + snprintf(lines[g], sizeof(lines[g]), CG_PATH_FORMAT, grps[g] + strlen(grps[0])); for (i = 0; i < n_iterations; ++i) { cg_enter_current_thread(grps[(i % 2) + 1]); @@ -640,10 +666,12 @@ static int test_cgcore_thread_migration(const char *root) if (cg_create(grps[2])) goto cleanup; - if (cg_write(grps[1], "cgroup.type", "threaded")) - goto cleanup; - if (cg_write(grps[2], "cgroup.type", "threaded")) - goto cleanup; + if (!cg_test_v1_named) { + if (cg_write(grps[1], "cgroup.type", "threaded")) + goto cleanup; + if (cg_write(grps[2], "cgroup.type", "threaded")) + goto cleanup; + } if (cg_enter_current(grps[1])) goto cleanup; @@ -657,7 +685,7 @@ static int test_cgcore_thread_migration(const char *root) if (retval) goto cleanup; - snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0])); + snprintf(line, sizeof(line), CG_PATH_FORMAT, grps[1] + strlen(grps[0])); if (proc_read_strstr(0, 1, "cgroup", line)) goto cleanup; @@ -683,7 +711,7 @@ cleanup: */ static int test_cgcore_lesser_euid_open(const char *root) { - const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ + const uid_t test_euid = TEST_UID; int ret = KSFT_FAIL; char *cg_test_a = NULL, *cg_test_b = NULL; char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; @@ -775,6 +803,9 @@ static int test_cgcore_lesser_ns_open(const char *root) pid_t pid; int status; + if (!nsdelegate) + return KSFT_SKIP; + cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_b"); @@ -837,6 +868,38 @@ cleanup: return ret; } +static int setup_named_v1_root(char *root, size_t len, const char *name) +{ + char options[PATH_MAX]; + int r; + + r = snprintf(root, len, "/mnt/cg_selftest"); + if (r < 0) + return r; + + r = snprintf(options, sizeof(options), "none,name=%s", name); + if (r < 0) + return r; + + r = mkdir(root, 0755); + if (r < 0 && errno != EEXIST) + return r; + + r = mount("none", root, "cgroup", 0, options); + if (r < 0) + return r; + + return 0; +} + +static void cleanup_named_v1_root(char *root) +{ + if (!cg_test_v1_named) + return; + umount(root); + rmdir(root); +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -860,15 +923,22 @@ struct corecg_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; - - if (cg_find_unified_root(root, sizeof(root))) - ksft_exit_skip("cgroup v2 isn't mounted\n"); + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) { + if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME)) + ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n"); + cg_test_v1_named = true; + goto post_v2_setup; + } if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) if (cg_write(root, "cgroup.subtree_control", "+memory")) ksft_exit_skip("Failed to set memory controller\n"); +post_v2_setup: for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { case KSFT_PASS: @@ -878,11 +948,11 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + cleanup_named_v1_root(root); + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 24020a2c68dc..c83f05438d7c 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -2,14 +2,16 @@ #define _GNU_SOURCE #include <linux/limits.h> +#include <sys/param.h> #include <sys/sysinfo.h> #include <sys/wait.h> #include <errno.h> #include <pthread.h> #include <stdio.h> #include <time.h> +#include <unistd.h> -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" enum hog_clock_type { @@ -217,7 +219,7 @@ static int test_cpucg_stats(const char *root) if (user_usec <= 0) goto cleanup; - if (!values_close(usage_usec, expected_usage_usec, 1)) + if (!values_close_report(usage_usec, expected_usage_usec, 1)) goto cleanup; ret = KSFT_PASS; @@ -229,6 +231,79 @@ cleanup: return ret; } +/* + * Creates a nice process that consumes CPU and checks that the elapsed + * usertime in the cgroup is close to the expected time. + */ +static int test_cpucg_nice(const char *root) +{ + int ret = KSFT_FAIL; + int status; + long user_usec, nice_usec; + long usage_seconds = 2; + long expected_nice_usec = usage_seconds * USEC_PER_SEC; + char *cpucg; + pid_t pid; + + cpucg = cg_name(root, "cpucg_test"); + if (!cpucg) + goto cleanup; + + if (cg_create(cpucg)) + goto cleanup; + + user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec"); + nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec"); + if (nice_usec == -1) + ret = KSFT_SKIP; + if (user_usec != 0 || nice_usec != 0) + goto cleanup; + + /* + * We fork here to create a new process that can be niced without + * polluting the nice value of other selftests + */ + pid = fork(); + if (pid < 0) { + goto cleanup; + } else if (pid == 0) { + struct cpu_hog_func_param param = { + .nprocs = 1, + .ts = { + .tv_sec = usage_seconds, + .tv_nsec = 0, + }, + .clock_type = CPU_HOG_CLOCK_PROCESS, + }; + char buf[64]; + snprintf(buf, sizeof(buf), "%d", getpid()); + if (cg_write(cpucg, "cgroup.procs", buf)) + goto cleanup; + + /* Try to keep niced CPU usage as constrained to hog_cpu as possible */ + nice(1); + hog_cpus_timed(cpucg, ¶m); + exit(0); + } else { + waitpid(pid, &status, 0); + if (!WIFEXITED(status)) + goto cleanup; + + user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec"); + nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec"); + if (!values_close_report(nice_usec, expected_nice_usec, 1)) + goto cleanup; + + ret = KSFT_PASS; + } + +cleanup: + cg_destroy(cpucg); + free(cpucg); + + return ret; +} + static int run_cpucg_weight_test( const char *root, @@ -237,7 +312,7 @@ run_cpucg_weight_test( { int ret = KSFT_FAIL, i; char *parent = NULL; - struct cpu_hogger children[3] = {NULL}; + struct cpu_hogger children[3] = {}; parent = cg_name(root, "cpucg_test_0"); if (!parent) @@ -329,7 +404,7 @@ overprovision_validate(const struct cpu_hogger *children, int num_children) goto cleanup; delta = children[i + 1].usage - children[i].usage; - if (!values_close(delta, children[0].usage, 35)) + if (!values_close_report(delta, children[0].usage, 35)) goto cleanup; } @@ -369,7 +444,7 @@ underprovision_validate(const struct cpu_hogger *children, int num_children) int ret = KSFT_FAIL, i; for (i = 0; i < num_children - 1; i++) { - if (!values_close(children[i + 1].usage, children[0].usage, 15)) + if (!values_close_report(children[i + 1].usage, children[0].usage, 15)) goto cleanup; } @@ -408,7 +483,7 @@ run_cpucg_nested_weight_test(const char *root, bool overprovisioned) { int ret = KSFT_FAIL, i; char *parent = NULL, *child = NULL; - struct cpu_hogger leaf[3] = {NULL}; + struct cpu_hogger leaf[3] = {}; long nested_leaf_usage, child_usage; int nprocs = get_nprocs(); @@ -498,16 +573,16 @@ run_cpucg_nested_weight_test(const char *root, bool overprovisioned) nested_leaf_usage = leaf[1].usage + leaf[2].usage; if (overprovisioned) { - if (!values_close(leaf[0].usage, nested_leaf_usage, 15)) + if (!values_close_report(leaf[0].usage, nested_leaf_usage, 15)) goto cleanup; - } else if (!values_close(leaf[0].usage * 2, nested_leaf_usage, 15)) + } else if (!values_close_report(leaf[0].usage * 2, nested_leaf_usage, 15)) goto cleanup; child_usage = cg_read_key_long(child, "cpu.stat", "usage_usec"); if (child_usage <= 0) goto cleanup; - if (!values_close(child_usage, nested_leaf_usage, 1)) + if (!values_close_report(child_usage, nested_leaf_usage, 1)) goto cleanup; ret = KSFT_PASS; @@ -571,10 +646,16 @@ test_cpucg_nested_weight_underprovisioned(const char *root) static int test_cpucg_max(const char *root) { int ret = KSFT_FAIL; - long usage_usec, user_usec; - long usage_seconds = 1; - long expected_usage_usec = usage_seconds * USEC_PER_SEC; + long quota_usec = 1000; + long default_period_usec = 100000; /* cpu.max's default period */ + long duration_seconds = 1; + + long duration_usec = duration_seconds * USEC_PER_SEC; + long usage_usec, n_periods, remainder_usec, expected_usage_usec; char *cpucg; + char quota_buf[32]; + + snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec); cpucg = cg_name(root, "cpucg_test"); if (!cpucg) @@ -583,13 +664,13 @@ static int test_cpucg_max(const char *root) if (cg_create(cpucg)) goto cleanup; - if (cg_write(cpucg, "cpu.max", "1000")) + if (cg_write(cpucg, "cpu.max", quota_buf)) goto cleanup; struct cpu_hog_func_param param = { .nprocs = 1, .ts = { - .tv_sec = usage_seconds, + .tv_sec = duration_seconds, .tv_nsec = 0, }, .clock_type = CPU_HOG_CLOCK_WALL, @@ -598,14 +679,19 @@ static int test_cpucg_max(const char *root) goto cleanup; usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec"); - user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec"); - if (user_usec <= 0) + if (usage_usec <= 0) goto cleanup; - if (user_usec >= expected_usage_usec) - goto cleanup; + /* + * The following calculation applies only since + * the cpu hog is set to run as per wall-clock time + */ + n_periods = duration_usec / default_period_usec; + remainder_usec = duration_usec - n_periods * default_period_usec; + expected_usage_usec + = n_periods * quota_usec + MIN(remainder_usec, quota_usec); - if (values_close(usage_usec, expected_usage_usec, 95)) + if (!values_close_report(usage_usec, expected_usage_usec, 10)) goto cleanup; ret = KSFT_PASS; @@ -624,10 +710,16 @@ cleanup: static int test_cpucg_max_nested(const char *root) { int ret = KSFT_FAIL; - long usage_usec, user_usec; - long usage_seconds = 1; - long expected_usage_usec = usage_seconds * USEC_PER_SEC; + long quota_usec = 1000; + long default_period_usec = 100000; /* cpu.max's default period */ + long duration_seconds = 1; + + long duration_usec = duration_seconds * USEC_PER_SEC; + long usage_usec, n_periods, remainder_usec, expected_usage_usec; char *parent, *child; + char quota_buf[32]; + + snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec); parent = cg_name(root, "cpucg_parent"); child = cg_name(parent, "cpucg_child"); @@ -643,13 +735,13 @@ static int test_cpucg_max_nested(const char *root) if (cg_create(child)) goto cleanup; - if (cg_write(parent, "cpu.max", "1000")) + if (cg_write(parent, "cpu.max", quota_buf)) goto cleanup; struct cpu_hog_func_param param = { .nprocs = 1, .ts = { - .tv_sec = usage_seconds, + .tv_sec = duration_seconds, .tv_nsec = 0, }, .clock_type = CPU_HOG_CLOCK_WALL, @@ -658,14 +750,19 @@ static int test_cpucg_max_nested(const char *root) goto cleanup; usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec"); - user_usec = cg_read_key_long(child, "cpu.stat", "user_usec"); - if (user_usec <= 0) + if (usage_usec <= 0) goto cleanup; - if (user_usec >= expected_usage_usec) - goto cleanup; + /* + * The following calculation applies only since + * the cpu hog is set to run as per wall-clock time + */ + n_periods = duration_usec / default_period_usec; + remainder_usec = duration_usec - n_periods * default_period_usec; + expected_usage_usec + = n_periods * quota_usec + MIN(remainder_usec, quota_usec); - if (values_close(usage_usec, expected_usage_usec, 95)) + if (!values_close_report(usage_usec, expected_usage_usec, 10)) goto cleanup; ret = KSFT_PASS; @@ -686,6 +783,7 @@ struct cpucg_test { } tests[] = { T(test_cpucg_subtree_control), T(test_cpucg_stats), + T(test_cpucg_nice), T(test_cpucg_weight_overprovisioned), T(test_cpucg_weight_underprovisioned), T(test_cpucg_nested_weight_overprovisioned), @@ -698,9 +796,11 @@ struct cpucg_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; - if (cg_find_unified_root(root, sizeof(root))) + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "cpu")) @@ -716,11 +816,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c new file mode 100644 index 000000000000..c5cf8b56ceb8 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/limits.h> +#include <signal.h> + +#include "kselftest.h" +#include "cgroup_util.h" + +static int idle_process_fn(const char *cgroup, void *arg) +{ + (void)pause(); + return 0; +} + +static int do_migration_fn(const char *cgroup, void *arg) +{ + int object_pid = (int)(size_t)arg; + + if (setuid(TEST_UID)) + return EXIT_FAILURE; + + // XXX checking /proc/$pid/cgroup would be quicker than wait + if (cg_enter(cgroup, object_pid) || + cg_wait_for_proc_count(cgroup, 1)) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +static int do_controller_fn(const char *cgroup, void *arg) +{ + const char *child = cgroup; + const char *parent = arg; + + if (setuid(TEST_UID)) + return EXIT_FAILURE; + + if (!cg_read_strstr(child, "cgroup.controllers", "cpuset")) + return EXIT_FAILURE; + + if (cg_write(parent, "cgroup.subtree_control", "+cpuset")) + return EXIT_FAILURE; + + if (cg_read_strstr(child, "cgroup.controllers", "cpuset")) + return EXIT_FAILURE; + + if (cg_write(parent, "cgroup.subtree_control", "-cpuset")) + return EXIT_FAILURE; + + if (!cg_read_strstr(child, "cgroup.controllers", "cpuset")) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +/* + * Migrate a process between two sibling cgroups. + * The success should only depend on the parent cgroup permissions and not the + * migrated process itself (cpuset controller is in place because it uses + * security_task_setscheduler() in cgroup v1). + * + * Deliberately don't set cpuset.cpus in children to avoid definining migration + * permissions between two different cpusets. + */ +static int test_cpuset_perms_object(const char *root, bool allow) +{ + char *parent = NULL, *child_src = NULL, *child_dst = NULL; + char *parent_procs = NULL, *child_src_procs = NULL, *child_dst_procs = NULL; + const uid_t test_euid = TEST_UID; + int object_pid = 0; + int ret = KSFT_FAIL; + + parent = cg_name(root, "cpuset_test_0"); + if (!parent) + goto cleanup; + parent_procs = cg_name(parent, "cgroup.procs"); + if (!parent_procs) + goto cleanup; + if (cg_create(parent)) + goto cleanup; + + child_src = cg_name(parent, "cpuset_test_1"); + if (!child_src) + goto cleanup; + child_src_procs = cg_name(child_src, "cgroup.procs"); + if (!child_src_procs) + goto cleanup; + if (cg_create(child_src)) + goto cleanup; + + child_dst = cg_name(parent, "cpuset_test_2"); + if (!child_dst) + goto cleanup; + child_dst_procs = cg_name(child_dst, "cgroup.procs"); + if (!child_dst_procs) + goto cleanup; + if (cg_create(child_dst)) + goto cleanup; + + if (cg_write(parent, "cgroup.subtree_control", "+cpuset")) + goto cleanup; + + if (cg_read_strstr(child_src, "cgroup.controllers", "cpuset") || + cg_read_strstr(child_dst, "cgroup.controllers", "cpuset")) + goto cleanup; + + /* Enable permissions along src->dst tree path */ + if (chown(child_src_procs, test_euid, -1) || + chown(child_dst_procs, test_euid, -1)) + goto cleanup; + + if (allow && chown(parent_procs, test_euid, -1)) + goto cleanup; + + /* Fork a privileged child as a test object */ + object_pid = cg_run_nowait(child_src, idle_process_fn, NULL); + if (object_pid < 0) + goto cleanup; + + /* Carry out migration in a child process that can drop all privileges + * (including capabilities), the main process must remain privileged for + * cleanup. + * Child process's cgroup is irrelevant but we place it into child_dst + * as hacky way to pass information about migration target to the child. + */ + if (allow ^ (cg_run(child_dst, do_migration_fn, (void *)(size_t)object_pid) == EXIT_SUCCESS)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (object_pid > 0) { + (void)kill(object_pid, SIGTERM); + (void)clone_reap(object_pid, WEXITED); + } + + cg_destroy(child_dst); + free(child_dst_procs); + free(child_dst); + + cg_destroy(child_src); + free(child_src_procs); + free(child_src); + + cg_destroy(parent); + free(parent_procs); + free(parent); + + return ret; +} + +static int test_cpuset_perms_object_allow(const char *root) +{ + return test_cpuset_perms_object(root, true); +} + +static int test_cpuset_perms_object_deny(const char *root) +{ + return test_cpuset_perms_object(root, false); +} + +/* + * Migrate a process between parent and child implicitely + * Implicit migration happens when a controller is enabled/disabled. + * + */ +static int test_cpuset_perms_subtree(const char *root) +{ + char *parent = NULL, *child = NULL; + char *parent_procs = NULL, *parent_subctl = NULL, *child_procs = NULL; + const uid_t test_euid = TEST_UID; + int object_pid = 0; + int ret = KSFT_FAIL; + + parent = cg_name(root, "cpuset_test_0"); + if (!parent) + goto cleanup; + parent_procs = cg_name(parent, "cgroup.procs"); + if (!parent_procs) + goto cleanup; + parent_subctl = cg_name(parent, "cgroup.subtree_control"); + if (!parent_subctl) + goto cleanup; + if (cg_create(parent)) + goto cleanup; + + child = cg_name(parent, "cpuset_test_1"); + if (!child) + goto cleanup; + child_procs = cg_name(child, "cgroup.procs"); + if (!child_procs) + goto cleanup; + if (cg_create(child)) + goto cleanup; + + /* Enable permissions as in a delegated subtree */ + if (chown(parent_procs, test_euid, -1) || + chown(parent_subctl, test_euid, -1) || + chown(child_procs, test_euid, -1)) + goto cleanup; + + /* Put a privileged child in the subtree and modify controller state + * from an unprivileged process, the main process remains privileged + * for cleanup. + * The unprivileged child runs in subtree too to avoid parent and + * internal-node constraing violation. + */ + object_pid = cg_run_nowait(child, idle_process_fn, NULL); + if (object_pid < 0) + goto cleanup; + + if (cg_run(child, do_controller_fn, parent) != EXIT_SUCCESS) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (object_pid > 0) { + (void)kill(object_pid, SIGTERM); + (void)clone_reap(object_pid, WEXITED); + } + + cg_destroy(child); + free(child_procs); + free(child); + + cg_destroy(parent); + free(parent_subctl); + free(parent_procs); + free(parent); + + return ret; +} + + +#define T(x) { x, #x } +struct cpuset_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_cpuset_perms_object_allow), + T(test_cpuset_perms_object_deny), + T(test_cpuset_perms_subtree), +}; +#undef T + +int main(int argc, char *argv[]) +{ + char root[PATH_MAX]; + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset")) + if (cg_write(root, "cgroup.subtree_control", "+cpuset")) + ksft_exit_skip("Failed to set cpuset controller\n"); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + ksft_finished(); +} diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 186e1c26867e..a17256d9f88a 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -3,25 +3,18 @@ # # Test for cpuset v2 partition root state (PRS) # -# The sched verbose flag is set, if available, so that the console log +# The sched verbose flag can be optionally set so that the console log # can be examined for the correct setting of scheduling domain. # skip_test() { echo "$1" echo "Test SKIPPED" - exit 0 + exit 4 # ksft_skip } [[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" -# Set sched verbose flag, if available -if [[ -d /sys/kernel/debug/sched ]] -then - # Used to restore the original setting during cleanup - SCHED_DEBUG=$(cat /sys/kernel/debug/sched/verbose) - echo Y > /sys/kernel/debug/sched/verbose -fi # Get wait_inotify location WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify @@ -29,23 +22,35 @@ WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify # Find cgroup v2 mount point CGROUP2=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') [[ -n "$CGROUP2" ]] || skip_test "Cgroup v2 mount point not found!" +SUBPARTS_CPUS=$CGROUP2/.__DEBUG__.cpuset.cpus.subpartitions +CPULIST=$(cat $CGROUP2/cpuset.cpus.effective) -CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//") -[[ $CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!" +NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//") +[[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!" + +# Check to see if /dev/console exists and is writable +if [[ -c /dev/console && -w /dev/console ]] +then + CONSOLE=/dev/console +else + CONSOLE=/dev/null +fi # Set verbose flag and delay factor PROG=$1 -VERBOSE= +VERBOSE=0 DELAY_FACTOR=1 +SCHED_DEBUG= while [[ "$1" = -* ]] do case "$1" in - -v) VERBOSE=1 - break + -v) ((VERBOSE++)) + # Enable sched/verbose can slow thing down + [[ $DELAY_FACTOR -eq 1 ]] && + DELAY_FACTOR=2 ;; -d) DELAY_FACTOR=$2 shift - break ;; *) echo "Usage: $PROG [-v] [-d <delay-factor>" exit @@ -54,18 +59,63 @@ do shift done +# Set sched verbose flag if available when "-v" option is specified +if [[ $VERBOSE -gt 0 && -d /sys/kernel/debug/sched ]] +then + # Used to restore the original setting during cleanup + SCHED_DEBUG=$(cat /sys/kernel/debug/sched/verbose) + echo Y > /sys/kernel/debug/sched/verbose +fi + cd $CGROUP2 echo +cpuset > cgroup.subtree_control + +# +# If cpuset has been set up and used in child cgroups, we may not be able to +# create partition under root cgroup because of the CPU exclusivity rule. +# So we are going to skip the test if this is the case. +# [[ -d test ]] || mkdir test -cd test +echo 0-6 > test/cpuset.cpus +echo root > test/cpuset.cpus.partition +cat test/cpuset.cpus.partition | grep -q invalid +RESULT=$? +echo member > test/cpuset.cpus.partition +echo "" > test/cpuset.cpus +[[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!" + +# +# If isolated CPUs have been reserved at boot time (as shown in +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8 +# that will be used by this script for testing purpose. If not, some of +# the tests may fail incorrectly. Wait a bit and retry again just in case +# these isolated CPUs are leftover from previous run and have just been +# cleaned up earlier in this script. +# +# These pre-isolated CPUs should stay in an isolated state throughout the +# testing process for now. +# +BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) +[[ -n "$BOOT_ISOLCPUS" ]] && { + sleep 0.5 + BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) +} +if [[ -n "$BOOT_ISOLCPUS" ]] +then + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 8 ]] && + skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" + echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" +fi cleanup() { online_cpus - rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1 - cd .. - rmdir test > /dev/null 2>&1 - echo "$SCHED_DEBUG" > /sys/kernel/debug/sched/verbose + cd $CGROUP2 + rmdir A1/A2/A3 A1/A2 A1 B1 test/A1 test/B1 test > /dev/null 2>&1 + rmdir rtest/p1/c11 rtest/p1/c12 rtest/p2/c21 \ + rtest/p2/c22 rtest/p1 rtest/p2 rtest > /dev/null 2>&1 + [[ -n "$SCHED_DEBUG" ]] && + echo "$SCHED_DEBUG" > /sys/kernel/debug/sched/verbose } # Pause in ms @@ -85,8 +135,8 @@ console_msg() { MSG=$1 echo "$MSG" - echo "" > /dev/console - echo "$MSG" > /dev/console + echo "" > $CONSOLE + echo "$MSG" > $CONSOLE pause 0.01 } @@ -97,7 +147,7 @@ test_partition() [[ $? -eq 0 ]] || exit 1 ACTUAL_VAL=$(cat cpuset.cpus.partition) [[ $ACTUAL_VAL != $EXPECTED_VAL ]] && { - echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $EXPECTED_VAL" + echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $ACTUAL_VAL" echo "Test FAILED" exit 1 } @@ -108,7 +158,7 @@ test_effective_cpus() EXPECTED_VAL=$1 ACTUAL_VAL=$(cat cpuset.cpus.effective) [[ "$ACTUAL_VAL" != "$EXPECTED_VAL" ]] && { - echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$EXPECTED_VAL'" + echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$ACTUAL_VAL'" echo "Test FAILED" exit 1 } @@ -129,192 +179,314 @@ test_add_proc() } # -# Testing the new "isolated" partition root type -# -test_isolated() -{ - echo 2-3 > cpuset.cpus - TYPE=$(cat cpuset.cpus.partition) - [[ $TYPE = member ]] || echo member > cpuset.cpus.partition - - console_msg "Change from member to root" - test_partition root - - console_msg "Change from root to isolated" - test_partition isolated - - console_msg "Change from isolated to member" - test_partition member - - console_msg "Change from member to isolated" - test_partition isolated - - console_msg "Change from isolated to root" - test_partition root - - console_msg "Change from root to member" - test_partition member - - # - # Testing partition root with no cpu - # - console_msg "Distribute all cpus to child partition" - echo +cpuset > cgroup.subtree_control - test_partition root - - mkdir A1 - cd A1 - echo 2-3 > cpuset.cpus - test_partition root - test_effective_cpus 2-3 - cd .. - test_effective_cpus "" - - console_msg "Moving task to partition test" - test_add_proc "No space left" - cd A1 - test_add_proc "" - cd .. - - console_msg "Shrink and expand child partition" - cd A1 - echo 2 > cpuset.cpus - cd .. - test_effective_cpus 3 - cd A1 - echo 2-3 > cpuset.cpus - cd .. - test_effective_cpus "" - - # Cleaning up - console_msg "Cleaning up" - echo $$ > $CGROUP2/cgroup.procs - [[ -d A1 ]] && rmdir A1 -} - -# # Cpuset controller state transition test matrix. # # Cgroup test hierarchy # -# test -- A1 -- A2 -- A3 -# \- B1 +# root +# | +# +------+------+ +# | | +# A1 B1 +# | +# A2 +# | +# A3 # -# P<v> = set cpus.partition (0:member, 1:root, 2:isolated, -1:root invalid) -# C<l> = add cpu-list +# P<v> = set cpus.partition (0:member, 1:root, 2:isolated) +# C<l> = add cpu-list to cpuset.cpus +# X<l> = add cpu-list to cpuset.cpus.exclusive # S<p> = use prefix in subtree_control # T = put a task into cgroup -# O<c>-<v> = Write <v> to CPU online file of <c> +# CX<l> = add cpu-list to both cpuset.cpus and cpuset.cpus.exclusive +# O<c>=<v> = Write <v> to CPU online file of <c> +# +# ECPUs - effective CPUs of cpusets +# Pstate - partition root state +# ISOLCPUS - isolated CPUs (<icpus>[,<icpus2>]) +# +# Note that if there are 2 fields in ISOLCPUS, the first one is for +# sched-debug matching which includes offline CPUs and single-CPU partitions +# while the second one is for matching cpuset.cpus.isolated. # SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1" TEST_MATRIX=( - # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate - # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ - " S+ C0-1 . . C2-3 S+ C4-5 . . 0 A2:0-1" - " S+ C0-1 . . C2-3 P1 . . . 0 " - " S+ C0-1 . . C2-3 P1:S+ C0-1:P1 . . 0 " - " S+ C0-1 . . C2-3 P1:S+ C1:P1 . . 0 " - " S+ C0-1:S+ . . C2-3 . . . P1 0 " - " S+ C0-1:P1 . . C2-3 S+ C1 . . 0 " - " S+ C0-1:P1 . . C2-3 S+ C1:P1 . . 0 " - " S+ C0-1:P1 . . C2-3 S+ C1:P1 . P1 0 " - " S+ C0-1:P1 . . C2-3 C4-5 . . . 0 A1:4-5" - " S+ C0-1:P1 . . C2-3 S+:C4-5 . . . 0 A1:4-5" - " S+ C0-1 . . C2-3:P1 . . . C2 0 " - " S+ C0-1 . . C2-3:P1 . . . C4-5 0 B1:4-5" - " S+ C0-3:P1:S+ C2-3:P1 . . . . . . 0 A1:0-1,A2:2-3" - " S+ C0-3:P1:S+ C2-3:P1 . . C1-3 . . . 0 A1:1,A2:2-3" - " S+ C2-3:P1:S+ C3:P1 . . C3 . . . 0 A1:,A2:3 A1:P1,A2:P1" - " S+ C2-3:P1:S+ C3:P1 . . C3 P0 . . 0 A1:3,A2:3 A1:P1,A2:P0" - " S+ C2-3:P1:S+ C2:P1 . . C2-4 . . . 0 A1:3-4,A2:2" - " S+ C2-3:P1:S+ C3:P1 . . C3 . . C0-2 0 A1:,B1:0-2 A1:P1,A2:P1" - " S+ $SETUP_A123_PARTITIONS . C2-3 . . . 0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS + # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- + " C0-1 . . C2-3 S+ C4-5 . . 0 A2:0-1" + " C0-1 . . C2-3 P1 . . . 0 " + " C0-1 . . C2-3 P1:S+ C0-1:P1 . . 0 " + " C0-1 . . C2-3 P1:S+ C1:P1 . . 0 " + " C0-1:S+ . . C2-3 . . . P1 0 " + " C0-1:P1 . . C2-3 S+ C1 . . 0 " + " C0-1:P1 . . C2-3 S+ C1:P1 . . 0 " + " C0-1:P1 . . C2-3 S+ C1:P1 . P1 0 " + " C0-1:P1 . . C2-3 C4-5 . . . 0 A1:4-5" + " C0-1:P1 . . C2-3 S+:C4-5 . . . 0 A1:4-5" + " C0-1 . . C2-3:P1 . . . C2 0 " + " C0-1 . . C2-3:P1 . . . C4-5 0 B1:4-5" + "C0-3:P1:S+ C2-3:P1 . . . . . . 0 A1:0-1|A2:2-3|XA2:2-3" + "C0-3:P1:S+ C2-3:P1 . . C1-3 . . . 0 A1:1|A2:2-3|XA2:2-3" + "C2-3:P1:S+ C3:P1 . . C3 . . . 0 A1:|A2:3|XA2:3 A1:P1|A2:P1" + "C2-3:P1:S+ C3:P1 . . C3 P0 . . 0 A1:3|A2:3 A1:P1|A2:P0" + "C2-3:P1:S+ C2:P1 . . C2-4 . . . 0 A1:3-4|A2:2" + "C2-3:P1:S+ C3:P1 . . C3 . . C0-2 0 A1:|B1:0-2 A1:P1|A2:P1" + "$SETUP_A123_PARTITIONS . C2-3 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1" # CPU offlining cases: - " S+ C0-1 . . C2-3 S+ C4-5 . O2-0 0 A1:0-1,B1:3" - " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 . . . 0 A1:0-1,A2:3" - " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 O2-1 . . 0 A1:0-1,A2:2-3" - " S+ C0-3:P1:S+ C2-3:P1 . . O1-0 . . . 0 A1:0,A2:2-3" - " S+ C0-3:P1:S+ C2-3:P1 . . O1-0 O1-1 . . 0 A1:0-1,A2:2-3" - " S+ C2-3:P1:S+ C3:P1 . . O3-0 O3-1 . . 0 A1:2,A2:3 A1:P1,A2:P1" - " S+ C2-3:P1:S+ C3:P2 . . O3-0 O3-1 . . 0 A1:2,A2:3 A1:P1,A2:P2" - " S+ C2-3:P1:S+ C3:P1 . . O2-0 O2-1 . . 0 A1:2,A2:3 A1:P1,A2:P1" - " S+ C2-3:P1:S+ C3:P2 . . O2-0 O2-1 . . 0 A1:2,A2:3 A1:P1,A2:P2" - " S+ C2-3:P1:S+ C3:P1 . . O2-0 . . . 0 A1:,A2:3 A1:P1,A2:P1" - " S+ C2-3:P1:S+ C3:P1 . . O3-0 . . . 0 A1:2,A2: A1:P1,A2:P1" - " S+ C2-3:P1:S+ C3:P1 . . T:O2-0 . . . 0 A1:3,A2:3 A1:P1,A2:P-1" - " S+ C2-3:P1:S+ C3:P1 . . . T:O3-0 . . 0 A1:2,A2:2 A1:P1,A2:P-1" - " S+ $SETUP_A123_PARTITIONS . O1-0 . . . 0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . O2-0 . . . 0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . O3-0 . . . 0 A1:1,A2:2,A3: A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . T:O1-0 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" - " S+ $SETUP_A123_PARTITIONS . . T:O2-0 . . 0 A1:1,A2:3,A3:3 A1:P1,A2:P1,A3:P-1" - " S+ $SETUP_A123_PARTITIONS . . . T:O3-0 . 0 A1:1,A2:2,A3:2 A1:P1,A2:P1,A3:P-1" - " S+ $SETUP_A123_PARTITIONS . T:O1-0 O1-1 . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . . T:O2-0 O2-1 . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . . . T:O3-0 O3-1 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . T:O1-0 O2-0 O1-1 . 0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1" - " S+ $SETUP_A123_PARTITIONS . T:O1-0 O2-0 O2-1 . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" - - # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate - # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + " C0-1 . . C2-3 S+ C4-5 . O2=0 0 A1:0-1|B1:3" + "C0-3:P1:S+ C2-3:P1 . . O2=0 . . . 0 A1:0-1|A2:3" + "C0-3:P1:S+ C2-3:P1 . . O2=0 O2=1 . . 0 A1:0-1|A2:2-3" + "C0-3:P1:S+ C2-3:P1 . . O1=0 . . . 0 A1:0|A2:2-3" + "C0-3:P1:S+ C2-3:P1 . . O1=0 O1=1 . . 0 A1:0-1|A2:2-3" + "C2-3:P1:S+ C3:P1 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P1" + "C2-3:P1:S+ C3:P2 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P2" + "C2-3:P1:S+ C3:P1 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P1" + "C2-3:P1:S+ C3:P2 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P2" + "C2-3:P1:S+ C3:P1 . . O2=0 . . . 0 A1:|A2:3 A1:P1|A2:P1" + "C2-3:P1:S+ C3:P1 . . O3=0 . . . 0 A1:2|A2: A1:P1|A2:P1" + "C2-3:P1:S+ C3:P1 . . T:O2=0 . . . 0 A1:3|A2:3 A1:P1|A2:P-1" + "C2-3:P1:S+ C3:P1 . . . T:O3=0 . . 0 A1:2|A2:2 A1:P1|A2:P-1" + "$SETUP_A123_PARTITIONS . O1=0 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . O2=0 . . . 0 A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . O3=0 . . . 0 A1:1|A2:2|A3: A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . T:O1=0 . . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1" + "$SETUP_A123_PARTITIONS . . T:O2=0 . . 0 A1:1|A2:3|A3:3 A1:P1|A2:P1|A3:P-1" + "$SETUP_A123_PARTITIONS . . . T:O3=0 . 0 A1:1|A2:2|A3:2 A1:P1|A2:P1|A3:P-1" + "$SETUP_A123_PARTITIONS . T:O1=0 O1=1 . . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . . T:O2=0 O2=1 . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . . . T:O3=0 O3=1 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . T:O1=0 O2=0 O1=1 . 0 A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1" + "$SETUP_A123_PARTITIONS . T:O1=0 O2=0 O2=1 . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1" + + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS + # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- + # + # Remote partition and cpuset.cpus.exclusive tests + # + " C0-3:S+ C1-3:S+ C2-3 . X2-3 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:2-3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1|A2:2-3|A3:2-3 A1:P0|A2:P2 2-3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" + " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3|A2:1-3|A3:2-3|B1:2-3 A1:P0|A3:P0|B1:P-2" + " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" + " C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1|A3:2-3 A2:P2|A3:P2 1-3" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3|B1:4-5 A3:P2|B1:P2 2-5" + " C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4|A2:1-3|A3:1-3 A2:P2 1-3" + " C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4|A2:4|A3:2-3 A3:P2 2-3" + + # Nested remote/local partition tests + " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4-5 \ + A1:P0|A2:P1|A3:P2|B1:P1 2-3" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4 \ + A1:P0|A2:P1|A3:P2|B1:P1 2-4|2-3" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1|A2:2-3|A3:2-3|B1:4 \ + A1:P0|A2:P1|A3:P0|B1:P1" + " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:2|A3:3|B1:4 \ + A1:P0|A2:P1|A3:P2|B1:P1 2-4|3" + " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1|A2:2-3|A3:4 \ + A1:P0|A2:P2|A3:P1 2-4|2-3" + " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1|A2:2|A3:3-4 \ + A1:P0|A2:P2|A3:P1 2" + " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ + . . X5 . . 0 A1:0-4|A2:1-4|A3:2-4 \ + A1:P0|A2:P-2|A3:P-1 ." + " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ + . . . X1 . 0 A1:0-1|A2:2-4|A3:2-4 \ + A1:P0|A2:P2|A3:P-1 2-4" + + # Remote partition offline tests + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1|A2:1|A3:3 A1:P0|A3:P2 2-3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" + " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2|A2:1-2|A3: A1:P0|A3:P2 3" + " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2|A2:1-2|A3:1-2 A1:P0|A3:P-2 3|" + + # An invalidated remote partition cannot self-recover from hotplug + " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3|A2:1-3|A3:2 A1:P0|A3:P-2 ." + + # cpus.exclusive.effective clearing test + " C0-3:S+ C1-3:S+ C2 . X2-3:X . . . 0 A1:0-3|A2:1-3|A3:2|XA1:" + + # Invalid to valid remote partition transition test + " C0-3:S+ C1-3 . . . X3:P2 . . 0 A1:0-3|A2:1-3|XA2: A2:P-2 ." + " C0-3:S+ C1-3:X3:P2 + . . X2-3 P2 . . 0 A1:0-2|A2:3|XA2:3 A2:P2 3" + + # Invalid to valid local partition direct transition tests + " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3" + " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3" + " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:4-6 A1:P-2|B1:P0" + " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3" + + # Local partition invalidation tests + " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ + . . . . . 0 A1:1|A2:2|A3:3 A1:P2|A2:P2|A3:P2 1-3" + " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ + . . X4 . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3" + " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ + . . C4:X . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3" + # Local partition CPU change tests + " C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2|A2:3-5 A1:P2|A2:P1 0-2" + " C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3|A2:4-5 A1:P2|A2:P1 1-3" + + # cpus_allowed/exclusive_cpus update tests + " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ + . X:C4 . P2 . 0 A1:4|A2:4|XA2:|XA3:|A3:4 \ + A1:P0|A3:P-2 ." + " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ + . X1 . P2 . 0 A1:0-3|A2:1-3|XA1:1|XA2:|XA3:|A3:2-3 \ + A1:P0|A3:P-2 ." + " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ + . . X3 P2 . 0 A1:0-2|A2:1-2|XA2:3|XA3:3|A3:3 \ + A1:P0|A3:P2 3" + " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ + . . X3 . . 0 A1:0-2|A2:1-2|XA2:3|XA3:3|A3:3|XA3:3 \ + A1:P0|A3:P2 3" + " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ + . X4 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:4|XA2:|XA3 \ + A1:P0|A3:P-2" + + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS + # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # - # Incorrect change to cpuset.cpus invalidates partition root + # Incorrect change to cpuset.cpus[.exclusive] invalidates partition root # # Adding CPUs to partition root that are not in parent's # cpuset.cpus is allowed, but those extra CPUs are ignored. - " S+ C2-3:P1:S+ C3:P1 . . . C2-4 . . 0 A1:,A2:2-3 A1:P1,A2:P1" + "C2-3:P1:S+ C3:P1 . . . C2-4 . . 0 A1:|A2:2-3 A1:P1|A2:P1" # Taking away all CPUs from parent or itself if there are tasks # will make the partition invalid. - " S+ C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3,A2:2-3 A1:P1,A2:P-1" - " S+ $SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" - " S+ $SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + "C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3|A2:2-3 A1:P1|A2:P-1" + " C3:P1:S+ C3 . . T P1 . . 0 A1:3|A2:3 A1:P1|A2:P-1" + "$SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1" + "$SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1" # Changing a partition root to member makes child partitions invalid - " S+ C2-3:P1:S+ C3:P1 . . P0 . . . 0 A1:2-3,A2:3 A1:P0,A2:P-1" - " S+ $SETUP_A123_PARTITIONS . C2-3 P0 . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P0,A3:P-1" + "C2-3:P1:S+ C3:P1 . . P0 . . . 0 A1:2-3|A2:3 A1:P0|A2:P-1" + "$SETUP_A123_PARTITIONS . C2-3 P0 . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P0|A3:P-1" # cpuset.cpus can contains cpus not in parent's cpuset.cpus as long # as they overlap. - " S+ C2-3:P1:S+ . . . . C3-4:P1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + "C2-3:P1:S+ . . . . C3-4:P1 . . 0 A1:2|A2:3 A1:P1|A2:P1" # Deletion of CPUs distributed to child cgroup is allowed. - " S+ C0-1:P1:S+ C1 . C2-3 C4-5 . . . 0 A1:4-5,A2:4-5" + "C0-1:P1:S+ C1 . C2-3 C4-5 . . . 0 A1:4-5|A2:4-5" # To become a valid partition root, cpuset.cpus must overlap parent's # cpuset.cpus. - " S+ C0-1:P1 . . C2-3 S+ C4-5:P1 . . 0 A1:0-1,A2:0-1 A1:P1,A2:P-1" + " C0-1:P1 . . C2-3 S+ C4-5:P1 . . 0 A1:0-1|A2:0-1 A1:P1|A2:P-1" # Enabling partition with child cpusets is allowed - " S+ C0-1:S+ C1 . C2-3 P1 . . . 0 A1:0-1,A2:1 A1:P1" + " C0-1:S+ C1 . C2-3 P1 . . . 0 A1:0-1|A2:1 A1:P1" - # A partition root with non-partition root parent is invalid, but it + # A partition root with non-partition root parent is invalid| but it # can be made valid if its parent becomes a partition root too. - " S+ C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1,A2:1 A1:P0,A2:P-2" - " S+ C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0,A2:1 A1:P1,A2:P2" + " C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2" + " C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1" # A non-exclusive cpuset.cpus change will invalidate partition and its siblings - " S+ C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P0" - " S+ C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1" - " S+ C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1" - - # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate - # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P0" + " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P-1" + " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P0|B1:P-1" + + # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it + " C0-3 . . C4-5 X5 . . . 0 A1:0-3|B1:4-5" + + # Child partition root that try to take all CPUs from parent partition + # with tasks will remain invalid. + " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1" + " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1|A2:1-4 A1:P1|A2:P1" + " C1-4:P1:S+ P1 . . T C1-4 . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1" + + # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't + # affect cpuset.cpus.exclusive.effective. + " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4|XA2:3" + + # cpuset.cpus can contain CPUs that overlap a sibling cpuset with cpus.exclusive + # but creating a local partition out of it is not allowed. Similarly and change + # in cpuset.cpus of a local partition that overlaps sibling exclusive CPUs will + # invalidate it. + " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1 0 A1:1|A2:2-4|B1:5-6|XB1:5-6 \ + A1:P0|A2:P2:B1:P1 2-4" + " CX1-4:S+ CX2-4:P2 . C3-6 . . . P1 0 A1:1|A2:2-4|B1:5-6 \ + A1:P0|A2:P2:B1:P-1 2-4" + " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \ + A1:P0|A2:P2:B1:P-1 2-4" + + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS + # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: # A task cannot be added to a partition with no cpu - " S+ C2-3:P1:S+ C3:P1 . . O2-0:T . . . 1 A1:,A2:3 A1:P1,A2:P1" + "C2-3:P1:S+ C3:P1 . . O2=0:T . . . 1 A1:|A2:3 A1:P1|A2:P1" + + # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected + " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3|B1:4-5" + + # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive + " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3|B1:4-5" +) + +# +# Cpuset controller remote partition test matrix. +# +# Cgroup test hierarchy +# +# root +# | +# rtest (cpuset.cpus.exclusive=1-7) +# | +# +------+------+ +# | | +# p1 p2 +# +--+--+ +--+--+ +# | | | | +# c11 c12 c21 c22 +# +# REMOTE_TEST_MATRIX uses the same notational convention as TEST_MATRIX. +# Only CPUs 1-7 should be used. +# +REMOTE_TEST_MATRIX=( + # old-p1 old-p2 old-c11 old-c12 old-c21 old-c22 + # new-p1 new-p2 new-c11 new-c12 new-c21 new-c22 ECPUs Pstate ISOLCPUS + # ------ ------ ------- ------- ------- ------- ----- ------ -------- + " X1-3:S+ X4-6:S+ X1-2 X3 X4-5 X6 \ + . . P2 P2 P2 P2 c11:1-2|c12:3|c21:4-5|c22:6 \ + c11:P2|c12:P2|c21:P2|c22:P2 1-6" + " CX1-4:S+ . X1-2:P2 C3 . . \ + . . . C3-4 . . p1:3-4|c11:1-2|c12:3-4 \ + p1:P0|c11:P2|c12:P0 1-2" + " CX1-4:S+ . X1-2:P2 . . . \ + X2-4 . . . . . p1:1,3-4|c11:2 \ + p1:P0|c11:P2 2" + " CX1-5:S+ . X1-2:P2 X3-5:P1 . . \ + X2-4 . . . . . p1:1,5|c11:2|c12:3-4 \ + p1:P0|c11:P2|c12:P1 2" + " CX1-4:S+ . X1-2:P2 X3-4:P1 . . \ + . . X2 . . . p1:1|c11:2|c12:3-4 \ + p1:P0|c11:P2|c12:P1 2" + # p1 as member, will get its effective CPUs from its parent rtest + " CX1-4:S+ . X1-2:P2 X3-4:P1 . . \ + . . X1 CX2-4 . . p1:5-7|c11:1|c12:2-4 \ + p1:P0|c11:P2|c12:P1 1" + " CX1-4:S+ X5-6:P1:S+ . . . . \ + . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \ + p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \ + 1-2,4-6|1-2,5-6" ) # # Write to the cpu online file -# $1 - <c>-<v> where <c> = cpu number, <v> value to be written +# $1 - <c>=<v> where <c> = cpu number, <v> value to be written # write_cpu_online() { - CPU=${1%-*} - VAL=${1#*-} + CPU=${1%=*} + VAL=${1#*=} CPUFILE=//sys/devices/system/cpu/cpu${CPU}/online if [[ $VAL -eq 0 ]] then @@ -326,7 +498,7 @@ write_cpu_online() } fi echo $VAL > $CPUFILE - pause 0.01 + pause 0.05 } # @@ -342,11 +514,12 @@ set_ctrl_state() TMPMSG=/tmp/.msg_$$ CGRP=$1 STATE=$2 - SHOWERR=${3}${VERBOSE} + SHOWERR=${3} CTRL=${CTRL:=$CONTROLLER} HASERR=0 REDIRECT="2> $TMPMSG" [[ -z "$STATE" || "$STATE" = '.' ]] && return 0 + [[ $VERBOSE -gt 0 ]] && SHOWERR=1 rm -f $TMPMSG for CMD in $(echo $STATE | sed -e "s/:/ /g") @@ -355,20 +528,27 @@ set_ctrl_state() SFILE=$CGRP/cgroup.subtree_control PFILE=$CGRP/cpuset.cpus.partition CFILE=$CGRP/cpuset.cpus - S=$(expr substr $CMD 1 1) - if [[ $S = S ]] - then - PREFIX=${CMD#?} + XFILE=$CGRP/cpuset.cpus.exclusive + case $CMD in + S*) PREFIX=${CMD#?} COMM="echo ${PREFIX}${CTRL} > $SFILE" eval $COMM $REDIRECT - elif [[ $S = C ]] - then + ;; + X*) CPUS=${CMD#?} + COMM="echo $CPUS > $XFILE" + eval $COMM $REDIRECT + ;; + CX*) + CPUS=${CMD#??} + COMM="echo $CPUS > $CFILE; echo $CPUS > $XFILE" + eval $COMM $REDIRECT + ;; + C*) CPUS=${CMD#?} COMM="echo $CPUS > $CFILE" eval $COMM $REDIRECT - elif [[ $S = P ]] - then - VAL=${CMD#?} + ;; + P*) VAL=${CMD#?} case $VAL in 0) VAL=member ;; @@ -383,15 +563,17 @@ set_ctrl_state() esac COMM="echo $VAL > $PFILE" eval $COMM $REDIRECT - elif [[ $S = O ]] - then - VAL=${CMD#?} + ;; + O*) VAL=${CMD#?} write_cpu_online $VAL - elif [[ $S = T ]] - then - COMM="echo 0 > $TFILE" + ;; + T*) COMM="echo 0 > $TFILE" eval $COMM $REDIRECT - fi + ;; + *) echo "Unknown command: $CMD" + exit 1 + ;; + esac RET=$? [[ $RET -ne 0 ]] && { [[ -n "$SHOWERR" ]] && { @@ -423,81 +605,110 @@ online_cpus() [[ -n "OFFLINE_CPUS" ]] && { for C in $OFFLINE_CPUS do - write_cpu_online ${C}-1 + write_cpu_online ${C}=1 done } } # -# Return 1 if the list of effective cpus isn't the same as the initial list. +# Remove all the test cgroup directories # reset_cgroup_states() { echo 0 > $CGROUP2/cgroup.procs online_cpus - rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1 - set_ctrl_state . S- - pause 0.01 + rmdir $RESET_LIST > /dev/null 2>&1 } dump_states() { - for DIR in A1 A1/A2 A1/A2/A3 B1 + for DIR in $CGROUP_LIST do + CPUS=$DIR/cpuset.cpus ECPUS=$DIR/cpuset.cpus.effective + XCPUS=$DIR/cpuset.cpus.exclusive + XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition - [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" - [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" + PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions + ISCPUS=$DIR/cpuset.cpus.isolated + [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" + [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" + [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" + [[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)" + [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" + [[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)" + [[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)" done } # +# Set the actual cgroup directory into $CGRP_DIR +# $1 - cgroup name +# +set_cgroup_dir() +{ + CGRP_DIR=$1 + [[ $CGRP_DIR = A2 ]] && CGRP_DIR=A1/A2 + [[ $CGRP_DIR = A3 ]] && CGRP_DIR=A1/A2/A3 + [[ $CGRP_DIR = c11 ]] && CGRP_DIR=p1/c11 + [[ $CGRP_DIR = c12 ]] && CGRP_DIR=p1/c12 + [[ $CGRP_DIR = c21 ]] && CGRP_DIR=p2/c21 + [[ $CGRP_DIR = c22 ]] && CGRP_DIR=p2/c22 +} + +# # Check effective cpus -# $1 - check string, format: <cgroup>:<cpu-list>[,<cgroup>:<cpu-list>]* +# $1 - check string, format: <cgroup>:<cpu-list>[|<cgroup>:<cpu-list>]* # check_effective_cpus() { CHK_STR=$1 - for CHK in $(echo $CHK_STR | sed -e "s/,/ /g") + for CHK in $(echo $CHK_STR | sed -e "s/|/ /g") do set -- $(echo $CHK | sed -e "s/:/ /g") CGRP=$1 - CPUS=$2 - [[ $CGRP = A2 ]] && CGRP=A1/A2 - [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 - FILE=$CGRP/cpuset.cpus.effective - [[ -e $FILE ]] || return 1 - [[ $CPUS = $(cat $FILE) ]] || return 1 + EXPECTED_CPUS=$2 + ACTUAL_CPUS= + if [[ $CGRP = X* ]] + then + CGRP=${CGRP#X} + FILE=cpuset.cpus.exclusive.effective + else + FILE=cpuset.cpus.effective + fi + set_cgroup_dir $CGRP + [[ -e $CGRP_DIR/$FILE ]] || return 1 + ACTUAL_CPUS=$(cat $CGRP_DIR/$FILE) + [[ $EXPECTED_CPUS = $ACTUAL_CPUS ]] || return 1 done } # # Check cgroup states -# $1 - check string, format: <cgroup>:<state>[,<cgroup>:<state>]* +# $1 - check string, format: <cgroup>:<state>[|<cgroup>:<state>]* # check_cgroup_states() { CHK_STR=$1 - for CHK in $(echo $CHK_STR | sed -e "s/,/ /g") + for CHK in $(echo $CHK_STR | sed -e "s/|/ /g") do set -- $(echo $CHK | sed -e "s/:/ /g") CGRP=$1 - STATE=$2 + EXPECTED_STATE=$2 FILE= - EVAL=$(expr substr $STATE 2 2) - [[ $CGRP = A2 ]] && CGRP=A1/A2 - [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + EVAL=$(expr substr $EXPECTED_STATE 2 2) - case $STATE in - P*) FILE=$CGRP/cpuset.cpus.partition + set_cgroup_dir $CGRP + case $EXPECTED_STATE in + P*) FILE=$CGRP_DIR/cpuset.cpus.partition ;; - *) echo "Unknown state: $STATE!" + *) echo "Unknown state: $EXPECTED_STATE!" exit 1 ;; esac - VAL=$(cat $FILE) + ACTUAL_STATE=$(cat $FILE) - case "$VAL" in + case "$ACTUAL_STATE" in member) VAL=0 ;; root) VAL=1 @@ -513,11 +724,204 @@ check_cgroup_states() ;; esac [[ $EVAL != $VAL ]] && return 1 + + # + # For root partition, dump sched-domains info to console if + # verbose mode set for manual comparison with sched debug info. + # + [[ $VAL -eq 1 && $VERBOSE -gt 0 ]] && { + DOMS=$(cat $CGRP_DIR/cpuset.cpus.effective) + [[ -n "$DOMS" ]] && + echo " [$CGRP_DIR] sched-domain: $DOMS" > $CONSOLE + } done return 0 } # +# Get isolated (including offline) CPUs by looking at +# /sys/kernel/debug/sched/domains and cpuset.cpus.isolated control file, +# if available, and compare that with the expected value. +# +# Note that isolated CPUs from the sched/domains context include offline +# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may +# not be included in the cpuset.cpus.isolated control file which contains +# only CPUs in isolated partitions as well as those that are isolated at +# boot time. +# +# $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>} +# <isolcpus1> - expected sched/domains value +# <isolcpus2> - cpuset.cpus.isolated value = <isolcpus1> if not defined +# +check_isolcpus() +{ + EXPECTED_ISOLCPUS=$1 + ISCPUS=${CGROUP2}/cpuset.cpus.isolated + ISOLCPUS=$(cat $ISCPUS) + LASTISOLCPU= + SCHED_DOMAINS=/sys/kernel/debug/sched/domains + if [[ $EXPECTED_ISOLCPUS = . ]] + then + EXPECTED_ISOLCPUS= + EXPECTED_SDOMAIN= + elif [[ $(expr $EXPECTED_ISOLCPUS : ".*|.*") > 0 ]] + then + set -- $(echo $EXPECTED_ISOLCPUS | sed -e "s/|/ /g") + EXPECTED_ISOLCPUS=$2 + EXPECTED_SDOMAIN=$1 + else + EXPECTED_SDOMAIN=$EXPECTED_ISOLCPUS + fi + + # + # Appending pre-isolated CPUs + # Even though CPU #8 isn't used for testing, it can't be pre-isolated + # to make appending those CPUs easier. + # + [[ -n "$BOOT_ISOLCPUS" ]] && { + EXPECTED_ISOLCPUS=${EXPECTED_ISOLCPUS:+${EXPECTED_ISOLCPUS},}${BOOT_ISOLCPUS} + EXPECTED_SDOMAIN=${EXPECTED_SDOMAIN:+${EXPECTED_SDOMAIN},}${BOOT_ISOLCPUS} + } + + # + # Check cpuset.cpus.isolated cpumask + # + [[ "$EXPECTED_ISOLCPUS" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 + ISOLCPUS=$(cat $ISCPUS) + } + [[ "$EXPECTED_ISOLCPUS" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= + EXPECTED_ISOLCPUS=$EXPECTED_SDOMAIN + + # + # Use the sched domain in debugfs to check isolated CPUs, if available + # + [[ -d $SCHED_DOMAINS ]] || return 0 + + for ((CPU=0; CPU < $NR_CPUS; CPU++)) + do + [[ -n "$(ls ${SCHED_DOMAINS}/cpu$CPU)" ]] && continue + + if [[ -z "$LASTISOLCPU" ]] + then + ISOLCPUS=$CPU + LASTISOLCPU=$CPU + elif [[ "$LASTISOLCPU" -eq $((CPU - 1)) ]] + then + echo $ISOLCPUS | grep -q "\<$LASTISOLCPU\$" + if [[ $? -eq 0 ]] + then + ISOLCPUS=${ISOLCPUS}- + fi + LASTISOLCPU=$CPU + else + if [[ $ISOLCPUS = *- ]] + then + ISOLCPUS=${ISOLCPUS}$LASTISOLCPU + fi + ISOLCPUS=${ISOLCPUS},$CPU + LASTISOLCPU=$CPU + fi + done + [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU + + [[ "$EXPECTED_SDOMAIN" = "$ISOLCPUS" ]] +} + +test_fail() +{ + TESTNUM=$1 + TESTTYPE=$2 + ADDINFO=$3 + echo "Test $TEST[$TESTNUM] failed $TESTTYPE check!" + [[ -n "$ADDINFO" ]] && echo "*** $ADDINFO ***" + eval echo \${$TEST[$I]} + echo + dump_states + exit 1 +} + +# +# Check to see if there are unexpected isolated CPUs left beyond the boot +# time isolated ones. +# +null_isolcpus_check() +{ + [[ $VERBOSE -gt 0 ]] || return 0 + # Retry a few times before printing error + RETRY=0 + while [[ $RETRY -lt 8 ]] + do + pause 0.02 + check_isolcpus "." + [[ $? -eq 0 ]] && return 0 + ((RETRY++)) + done + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 +} + +# +# Check state transition test result +# $1 - Test number +# $2 - Expected effective CPU values +# $3 - Expected partition states +# $4 - Expected isolated CPUs +# +check_test_results() +{ + _NR=$1 + _ECPUS="$2" + _PSTATES="$3" + _ISOLCPUS="$4" + + [[ -n "$_ECPUS" && "$_ECPUS" != . ]] && { + check_effective_cpus $_ECPUS + [[ $? -ne 0 ]] && test_fail $_NR "effective CPU" \ + "Cgroup $CGRP: expected $EXPECTED_CPUS, got $ACTUAL_CPUS" + } + + [[ -n "$_PSTATES" && "$_PSTATES" != . ]] && { + check_cgroup_states $_PSTATES + [[ $? -ne 0 ]] && test_fail $_NR states \ + "Cgroup $CGRP: expected $EXPECTED_STATE, got $ACTUAL_STATE" + } + + # Compare the expected isolated CPUs with the actual ones, + # if available + [[ -n "$_ISOLCPUS" ]] && { + check_isolcpus $_ISOLCPUS + [[ $? -ne 0 ]] && { + [[ -n "$BOOT_ISOLCPUS" ]] && _ISOLCPUS=${_ISOLCPUS},${BOOT_ISOLCPUS} + test_fail $_NR "isolated CPU" \ + "Expect $_ISOLCPUS, get $ISOLCPUS instead" + } + } + reset_cgroup_states + # + # Check to see if effective cpu list changes + # + _NEWLIST=$(cat $CGROUP2/cpuset.cpus.effective) + RETRY=0 + while [[ $_NEWLIST != $CPULIST && $RETRY -lt 8 ]] + do + # Wait a bit longer & recheck a few times + pause 0.02 + ((RETRY++)) + _NEWLIST=$(cat $CGROUP2/cpuset.cpus.effective) + done + [[ $_NEWLIST != $CPULIST ]] && { + echo "Effective cpus changed to $_NEWLIST after test $_NR!" + exit 1 + } + null_isolcpus_check + [[ $VERBOSE -gt 0 ]] && echo "Test $I done." +} + +# # Run cpuset state transition test # $1 - test matrix name # @@ -529,91 +933,189 @@ run_state_test() { TEST=$1 CONTROLLER=cpuset - CPULIST=0-6 + CGROUP_LIST=". A1 A1/A2 A1/A2/A3 B1" + RESET_LIST="A1/A2/A3 A1/A2 A1 B1" I=0 eval CNT="\${#$TEST[@]}" reset_cgroup_states - echo $CPULIST > cpuset.cpus - echo root > cpuset.cpus.partition console_msg "Running state transition test ..." while [[ $I -lt $CNT ]] do - echo "Running test $I ..." > /dev/console + echo "Running test $I ..." > $CONSOLE + [[ $VERBOSE -gt 1 ]] && { + echo "" + eval echo \${$TEST[$I]} + } eval set -- "\${$TEST[$I]}" - ROOT=$1 - OLD_A1=$2 - OLD_A2=$3 - OLD_A3=$4 - OLD_B1=$5 - NEW_A1=$6 - NEW_A2=$7 - NEW_A3=$8 - NEW_B1=$9 - RESULT=${10} - ECPUS=${11} - STATES=${12} - - set_ctrl_state_noerr . $ROOT + OLD_A1=$1 + OLD_A2=$2 + OLD_A3=$3 + OLD_B1=$4 + NEW_A1=$5 + NEW_A2=$6 + NEW_A3=$7 + NEW_B1=$8 + RESULT=$9 + ECPUS=${10} + STATES=${11} + ICPUS=${12} + set_ctrl_state_noerr A1 $OLD_A1 set_ctrl_state_noerr A1/A2 $OLD_A2 set_ctrl_state_noerr A1/A2/A3 $OLD_A3 set_ctrl_state_noerr B1 $OLD_B1 + RETVAL=0 set_ctrl_state A1 $NEW_A1; ((RETVAL += $?)) set_ctrl_state A1/A2 $NEW_A2; ((RETVAL += $?)) set_ctrl_state A1/A2/A3 $NEW_A3; ((RETVAL += $?)) set_ctrl_state B1 $NEW_B1; ((RETVAL += $?)) - [[ $RETVAL -ne $RESULT ]] && { - echo "Test $TEST[$I] failed result check!" - eval echo \"\${$TEST[$I]}\" - dump_states - online_cpus - exit 1 - } + [[ $RETVAL -ne $RESULT ]] && test_fail $I result - [[ -n "$ECPUS" && "$ECPUS" != . ]] && { - check_effective_cpus $ECPUS - [[ $? -ne 0 ]] && { - echo "Test $TEST[$I] failed effective CPU check!" - eval echo \"\${$TEST[$I]}\" - echo - dump_states - online_cpus - exit 1 - } - } + check_test_results $I "$ECPUS" "$STATES" "$ICPUS" + ((I++)) + done + echo "All $I tests of $TEST PASSED." +} - [[ -n "$STATES" ]] && { - check_cgroup_states $STATES - [[ $? -ne 0 ]] && { - echo "FAILED: Test $TEST[$I] failed states check!" - eval echo \"\${$TEST[$I]}\" - echo - dump_states - online_cpus - exit 1 - } - } +# +# Run cpuset remote partition state transition test +# $1 - test matrix name +# +run_remote_state_test() +{ + TEST=$1 + CONTROLLER=cpuset + [[ -d rtest ]] || mkdir rtest + cd rtest + echo +cpuset > cgroup.subtree_control + echo "1-7" > cpuset.cpus + echo "1-7" > cpuset.cpus.exclusive + CGROUP_LIST=".. . p1 p2 p1/c11 p1/c12 p2/c21 p2/c22" + RESET_LIST="p1/c11 p1/c12 p2/c21 p2/c22 p1 p2" + I=0 + eval CNT="\${#$TEST[@]}" - reset_cgroup_states - # - # Check to see if effective cpu list changes - # - pause 0.05 - NEWLIST=$(cat cpuset.cpus.effective) - [[ $NEWLIST != $CPULIST ]] && { - echo "Effective cpus changed to $NEWLIST after test $I!" - exit 1 + reset_cgroup_states + console_msg "Running remote partition state transition test ..." + + while [[ $I -lt $CNT ]] + do + echo "Running test $I ..." > $CONSOLE + [[ $VERBOSE -gt 1 ]] && { + echo "" + eval echo \${$TEST[$I]} } - [[ -n "$VERBOSE" ]] && echo "Test $I done." + eval set -- "\${$TEST[$I]}" + OLD_p1=$1 + OLD_p2=$2 + OLD_c11=$3 + OLD_c12=$4 + OLD_c21=$5 + OLD_c22=$6 + NEW_p1=$7 + NEW_p2=$8 + NEW_c11=$9 + NEW_c12=${10} + NEW_c21=${11} + NEW_c22=${12} + ECPUS=${13} + STATES=${14} + ICPUS=${15} + + set_ctrl_state_noerr p1 $OLD_p1 + set_ctrl_state_noerr p2 $OLD_p2 + set_ctrl_state_noerr p1/c11 $OLD_c11 + set_ctrl_state_noerr p1/c12 $OLD_c12 + set_ctrl_state_noerr p2/c21 $OLD_c21 + set_ctrl_state_noerr p2/c22 $OLD_c22 + + RETVAL=0 + set_ctrl_state p1 $NEW_p1 ; ((RETVAL += $?)) + set_ctrl_state p2 $NEW_p2 ; ((RETVAL += $?)) + set_ctrl_state p1/c11 $NEW_c11; ((RETVAL += $?)) + set_ctrl_state p1/c12 $NEW_c12; ((RETVAL += $?)) + set_ctrl_state p2/c21 $NEW_c21; ((RETVAL += $?)) + set_ctrl_state p2/c22 $NEW_c22; ((RETVAL += $?)) + + [[ $RETVAL -ne 0 ]] && test_fail $I result + + check_test_results $I "$ECPUS" "$STATES" "$ICPUS" ((I++)) done + cd .. + rmdir rtest echo "All $I tests of $TEST PASSED." +} - echo member > cpuset.cpus.partition +# +# Testing the new "isolated" partition root type +# +test_isolated() +{ + cd $CGROUP2/test + echo 2-3 > cpuset.cpus + TYPE=$(cat cpuset.cpus.partition) + [[ $TYPE = member ]] || echo member > cpuset.cpus.partition + + console_msg "Change from member to root" + test_partition root + + console_msg "Change from root to isolated" + test_partition isolated + + console_msg "Change from isolated to member" + test_partition member + + console_msg "Change from member to isolated" + test_partition isolated + + console_msg "Change from isolated to root" + test_partition root + + console_msg "Change from root to member" + test_partition member + + # + # Testing partition root with no cpu + # + console_msg "Distribute all cpus to child partition" + echo +cpuset > cgroup.subtree_control + test_partition root + + mkdir A1 + cd A1 + echo 2-3 > cpuset.cpus + test_partition root + test_effective_cpus 2-3 + cd .. + test_effective_cpus "" + + console_msg "Moving task to partition test" + test_add_proc "No space left" + cd A1 + test_add_proc "" + cd .. + + console_msg "Shrink and expand child partition" + cd A1 + echo 2 > cpuset.cpus + cd .. + test_effective_cpus 3 + cd A1 + echo 2-3 > cpuset.cpus + cd .. + test_effective_cpus "" + + # Cleaning up + console_msg "Cleaning up" + echo $$ > $CGROUP2/cgroup.procs + [[ -d A1 ]] && rmdir A1 + null_isolcpus_check + pause 0.05 } # @@ -638,6 +1140,7 @@ test_inotify() { ERR=0 PRS=/tmp/.prs_$$ + cd $CGROUP2/test [[ -f $WAIT_INOTIFY ]] || { echo "wait_inotify not found, inotify test SKIPPED." return @@ -651,7 +1154,7 @@ test_inotify() rm -f $PRS wait_inotify $PWD/cpuset.cpus.partition $PRS & pause 0.01 - set_ctrl_state . "O1-0" + set_ctrl_state . "O1=0" pause 0.01 check_cgroup_states ".:P-1" if [[ $? -ne 0 ]] @@ -678,12 +1181,13 @@ test_inotify() else echo "Inotify test PASSED" fi + echo member > cpuset.cpus.partition + echo "" > cpuset.cpus } trap cleanup 0 2 3 6 run_state_test TEST_MATRIX +run_remote_state_test REMOTE_TEST_MATRIX test_isolated test_inotify echo "All tests PASSED." -cd .. -rmdir test diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh new file mode 100755 index 000000000000..42a6628fb8bc --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Basc test for cpuset v1 interfaces write/read +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +write_test() { + dir=$1 + interface=$2 + value=$3 + original=$(cat $dir/$interface) + echo "testing $interface $value" + echo $value > $dir/$interface + new=$(cat $dir/$interface) + [[ $value -ne $(cat $dir/$interface) ]] && { + echo "$interface write $value failed: new:$new" + exit 1 + } +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" + +# +# Create a test cpuset, read write test +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR + +ITF_MATRIX=( + #interface value expect root_only + 'cpuset.cpus 0-1 0-1 0' + 'cpuset.mem_exclusive 1 1 0' + 'cpuset.mem_exclusive 0 0 0' + 'cpuset.mem_hardwall 1 1 0' + 'cpuset.mem_hardwall 0 0 0' + 'cpuset.memory_migrate 1 1 0' + 'cpuset.memory_migrate 0 0 0' + 'cpuset.memory_spread_page 1 1 0' + 'cpuset.memory_spread_page 0 0 0' + 'cpuset.memory_spread_slab 1 1 0' + 'cpuset.memory_spread_slab 0 0 0' + 'cpuset.mems 0 0 0' + 'cpuset.sched_load_balance 1 1 0' + 'cpuset.sched_load_balance 0 0 0' + 'cpuset.sched_relax_domain_level 2 2 0' + 'cpuset.memory_pressure_enabled 1 1 1' + 'cpuset.memory_pressure_enabled 0 0 1' +) + +run_test() +{ + cnt="${ITF_MATRIX[@]}" + for i in "${ITF_MATRIX[@]}" ; do + args=($i) + root_only=${args[3]} + [[ $root_only -eq 1 ]] && { + write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}" + continue + } + write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}" + done +} + +run_test +rmdir $CPUSET/$TDIR +echo "Test PASSED" +exit 0 diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh new file mode 100755 index 000000000000..7406c24be1ac --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test the special cpuset v1 hotplug case where a cpuset become empty of +# CPUs will force migration of tasks out to an ancestor. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" + +# +# Create a test cpuset, put a CPU and a task there and offline that CPU +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR +echo 1 > $CPUSET/$TDIR/cpuset.cpus +echo 0 > $CPUSET/$TDIR/cpuset.mems +sleep 10& +TASK=$! +echo $TASK > $CPUSET/$TDIR/tasks +NEWCS=$(cat /proc/$TASK/cpuset) +[[ $NEWCS != "/$TDIR" ]] && { + echo "Unexpected cpuset $NEWCS, test FAILED!" + exit 1 +} + +echo 0 > /sys/devices/system/cpu/cpu1/online +sleep 0.5 +echo 1 > /sys/devices/system/cpu/cpu1/online +NEWCS=$(cat /proc/$TASK/cpuset) +rmdir $CPUSET/$TDIR +[[ $NEWCS != "/" ]] && { + echo "cpuset $NEWCS, test FAILED!" + exit 1 +} +echo "Test PASSED" +exit 0 diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index ff519029f6f4..97fae92c8387 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -11,7 +11,7 @@ #include <string.h> #include <sys/wait.h> -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" #define DEBUG @@ -740,7 +740,7 @@ static int test_cgfreezer_ptraced(const char *root) /* * cg_check_frozen(cgroup, true) will fail here, - * because the task in in the TRACEd state. + * because the task is in the TRACEd state. */ if (cg_freeze_wait(cgroup, false)) goto cleanup; @@ -804,6 +804,662 @@ cleanup: return ret; } +/* + * Get the current frozen_usec for the cgroup. + */ +static long cg_check_freezetime(const char *cgroup) +{ + return cg_read_key_long(cgroup, "cgroup.stat.local", + "frozen_usec "); +} + +/* + * Test that the freeze time will behave as expected for an empty cgroup. + */ +static int test_cgfreezer_time_empty(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_empty"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create an empty cgroup and check that its freeze time + * is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 2) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 3) Unfreeze the cgroup. Check that the freeze time is + * larger than at 2). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Check the freeze time again to ensure that it has not + * changed. + */ + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * A simple test for cgroup freezer time accounting. This test follows + * the same flow as test_cgfreezer_time_empty, but with a single process + * in the cgroup. + */ +static int test_cgfreezer_time_simple(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_simple"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create a cgroup and check that its freeze time is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Populate the cgroup with one child and check that the + * freeze time is still 0. + */ + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr > prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 3) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 4) Unfreeze the cgroup. Check that the freeze time is + * larger than at 3). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Sleep for 1000 us. Check that the freeze time is the + * same as at 4). + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that freezer time accounting works as expected, even while we're + * populating a cgroup with processes. + */ +static int test_cgfreezer_time_populate(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + int i; + + cgroup = cg_name(root, "cg_time_test_populate"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 1) Populate the cgroup with 100 processes. Check that + * the freeze time is 0. + */ + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Wait for the group to become fully populated. Check + * that the freeze time is 0. + */ + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 3) Freeze the cgroup and then populate it with 100 more + * processes. Check that the freeze time continues to grow. + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Wait for the group to become fully populated. Check + * that the freeze time is larger than at 3). + */ + if (cg_wait_for_proc_count(cgroup, 200)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Unfreeze the cgroup. Check that the freeze time is + * larger than at 4). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 6) Kill the processes. Check that the freeze time is the + * same as it was at 5). + */ + if (cg_killall(cgroup)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 7) Freeze and unfreeze the cgroup. Check that the freeze + * time is larger than it was at 6). + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that frozen time for a cgroup continues to work as expected, + * even as processes are migrated. Frozen cgroup A's freeze time should + * continue to increase and running cgroup B's should stay 0. + */ +static int test_cgfreezer_time_migrate(const char *root) +{ + long prev_A, curr_A, curr_B; + char *cgroup[2] = {0}; + int ret = KSFT_FAIL; + int pid; + + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); + if (!cgroup[1]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + pid = cg_run_nowait(cgroup[0], child_fn, NULL); + if (pid < 0) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup[0], 1)) + goto cleanup; + + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A) { + debug("Expect time (%ld) to be 0\n", curr_A); + goto cleanup; + } + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + /* + * Freeze cgroup A. + */ + if (cg_freeze_wait(cgroup[0], true)) + goto cleanup; + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be > 0\n", curr_A); + goto cleanup; + } + + /* + * Migrate from A (frozen) to B (running). + */ + if (cg_enter(cgroup[1], pid)) + goto cleanup; + + usleep(1000); + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr_A, prev_A); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup[0]) + cg_destroy(cgroup[0]); + free(cgroup[0]); + if (cgroup[1]) + cg_destroy(cgroup[1]); + free(cgroup[1]); + return ret; +} + +/* + * The test creates a cgroup and freezes it. Then it creates a child cgroup. + * After that it checks that the child cgroup has a non-zero freeze time + * that is less than the parent's. Next, it freezes the child, unfreezes + * the parent, and sleeps. Finally, it checks that the child's freeze + * time has grown larger than the parent's. + */ +static int test_cgfreezer_time_parent(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_parent_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_parent_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_freeze_wait(parent, true)) + goto cleanup; + + usleep(1000); + if (cg_create(child)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + /* + * Since the parent was frozen the entire time the child cgroup + * was being created, we expect the parent's freeze time to be + * larger than the child's. + * + * Ideally, we would be able to check both times simultaneously, + * but here we get the child's after we get the parent's. + */ + ptime = cg_check_freezetime(parent); + ctime = cg_check_freezetime(child); + if (ptime <= ctime) { + debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime); + goto cleanup; + } + + if (cg_freeze_nowait(child, true)) + goto cleanup; + + if (cg_freeze_wait(parent, false)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + usleep(100000); + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + + if (ctime <= ptime) { + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates a parent cgroup and a child cgroup. Then, it freezes + * the child and checks that the child's freeze time is greater than the + * parent's, which should be zero. + */ +static int test_cgfreezer_time_child(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_child_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_child_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(child)) + goto cleanup; + + if (cg_freeze_wait(child, true)) + goto cleanup; + + ctime = cg_check_freezetime(child); + ptime = cg_check_freezetime(parent); + if (ptime != 0) { + debug("Expect ptime (%ld) to be 0\n", ptime); + goto cleanup; + } + + if (ctime <= ptime) { + debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + free(child); + if (parent) + cg_destroy(parent); + free(parent); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * | + * B + * | + * C + * + * Then it freezes the cgroups in the order C, B, A. + * Then it unfreezes the cgroups in the order A, B, C. + * Then it checks that C's freeze time is larger than B's and + * that B's is larger than A's. + */ +static int test_cgfreezer_time_nested(const char *root) +{ + char *cgroup[3] = {0}; + int ret = KSFT_FAIL; + long time[3] = {0}; + int i; + + cgroup[0] = cg_name(root, "cg_test_time_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + if (cg_create(cgroup[2])) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[0], true)) + goto cleanup; + + usleep(1000); + + if (cg_freeze_nowait(cgroup[0], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], false)) + goto cleanup; + + time[2] = cg_check_freezetime(cgroup[2]); + time[1] = cg_check_freezetime(cgroup[1]); + time[0] = cg_check_freezetime(cgroup[0]); + + if (time[2] <= time[1]) { + debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]); + goto cleanup; + } + + if (time[1] <= time[0]) { + debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + for (i = 2; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + #define T(x) { x, #x } struct cgfreezer_test { int (*fn)(const char *root); @@ -819,15 +1475,24 @@ struct cgfreezer_test { T(test_cgfreezer_stopped), T(test_cgfreezer_ptraced), T(test_cgfreezer_vfork), + T(test_cgfreezer_time_empty), + T(test_cgfreezer_time_simple), + T(test_cgfreezer_time_populate), + T(test_cgfreezer_time_migrate), + T(test_cgfreezer_time_parent), + T(test_cgfreezer_time_child), + T(test_cgfreezer_time_nested), }; #undef T int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; - if (cg_find_unified_root(root, sizeof(root))) + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { @@ -838,11 +1503,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c new file mode 100644 index 000000000000..f451aa449be6 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <linux/limits.h> +#include <sys/mman.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include "kselftest.h" +#include "cgroup_util.h" + +#define ADDR ((void *)(0x0UL)) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +/* mapping 8 MBs == 4 hugepages */ +#define LENGTH (8UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* borrowed from mm/hmm-tests.c */ +static long get_hugepage_size(void) +{ + int fd; + char buf[2048]; + int len; + char *p, *q, *path = "/proc/meminfo", *tag = "Hugepagesize:"; + long val; + + fd = open(path, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +static int set_file(const char *path, long value) +{ + FILE *file; + int ret; + + file = fopen(path, "w"); + if (!file) + return -1; + ret = fprintf(file, "%ld\n", value); + fclose(file); + return ret; +} + +static int set_nr_hugepages(long value) +{ + return set_file("/proc/sys/vm/nr_hugepages", value); +} + +static unsigned int check_first(char *addr) +{ + return *(unsigned int *)addr; +} + +static void write_data(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int hugetlb_test_program(const char *cgroup, void *arg) +{ + char *test_group = (char *)arg; + void *addr; + long old_current, expected_current, current; + int ret = EXIT_FAILURE; + + old_current = cg_read_long(test_group, "memory.current"); + set_nr_hugepages(20); + current = cg_read_long(test_group, "memory.current"); + if (current - old_current >= MB(2)) { + ksft_print_msg( + "setting nr_hugepages should not increase hugepage usage.\n"); + ksft_print_msg("before: %ld, after: %ld\n", old_current, current); + return EXIT_FAILURE; + } + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); + if (addr == MAP_FAILED) { + ksft_print_msg("fail to mmap.\n"); + return EXIT_FAILURE; + } + current = cg_read_long(test_group, "memory.current"); + if (current - old_current >= MB(2)) { + ksft_print_msg("mmap should not increase hugepage usage.\n"); + ksft_print_msg("before: %ld, after: %ld\n", old_current, current); + goto out_failed_munmap; + } + old_current = current; + + /* read the first page */ + check_first(addr); + expected_current = old_current + MB(2); + current = cg_read_long(test_group, "memory.current"); + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should increase by around 2MB.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + goto out_failed_munmap; + } + + /* write to the whole range */ + write_data(addr); + current = cg_read_long(test_group, "memory.current"); + expected_current = old_current + MB(8); + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should increase by around 8MB.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + goto out_failed_munmap; + } + + /* unmap the whole range */ + munmap(addr, LENGTH); + current = cg_read_long(test_group, "memory.current"); + expected_current = old_current; + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should go back down.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + return ret; + } + + ret = EXIT_SUCCESS; + return ret; + +out_failed_munmap: + munmap(addr, LENGTH); + return ret; +} + +static int test_hugetlb_memcg(char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "hugetlb_memcg_test"); + if (!test_group || cg_create(test_group)) { + ksft_print_msg("fail to create cgroup.\n"); + goto out; + } + + if (cg_write(test_group, "memory.max", "100M")) { + ksft_print_msg("fail to set cgroup memory limit.\n"); + goto out; + } + + /* disable swap */ + if (cg_write(test_group, "memory.swap.max", "0")) { + ksft_print_msg("fail to disable swap.\n"); + goto out; + } + + if (!cg_run(test_group, hugetlb_test_program, (void *)test_group)) + ret = KSFT_PASS; +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + int ret = EXIT_SUCCESS, has_memory_hugetlb_acc; + + has_memory_hugetlb_acc = proc_mount_contains("memory_hugetlb_accounting"); + if (has_memory_hugetlb_acc < 0) + ksft_exit_skip("Failed to query cgroup mount option\n"); + else if (!has_memory_hugetlb_acc) + ksft_exit_skip("memory hugetlb accounting is disabled\n"); + + /* Unit is kB! */ + if (get_hugepage_size() != 2048) { + ksft_print_msg("test_hugetlb_memcg requires 2MB hugepages\n"); + ksft_test_result_skip("test_hugetlb_memcg\n"); + return ret; + } + + if (cg_find_unified_root(root, sizeof(root), NULL)) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + switch (test_hugetlb_memcg(root)) { + case KSFT_PASS: + ksft_test_result_pass("test_hugetlb_memcg\n"); + break; + case KSFT_SKIP: + ksft_test_result_skip("test_hugetlb_memcg\n"); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("test_hugetlb_memcg\n"); + break; + } + + return ret; +} diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c index 6153690319c9..c8c9d306925b 100644 --- a/tools/testing/selftests/cgroup/test_kill.c +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -9,7 +9,7 @@ #include <sys/types.h> #include <unistd.h> -#include "../kselftest.h" +#include "kselftest.h" #include "../pidfd/pidfd.h" #include "cgroup_util.h" @@ -274,9 +274,11 @@ struct cgkill_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; - if (cg_find_unified_root(root, sizeof(root))) + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { @@ -287,11 +289,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 258ddc565deb..ca38525484e3 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -14,7 +14,7 @@ #include <sys/sysinfo.h> #include <pthread.h> -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" @@ -70,12 +70,16 @@ static int test_kmem_basic(const char *root) goto cleanup; cg_write(cg, "memory.high", "1M"); + + /* wait for RCU freeing */ + sleep(1); + slab1 = cg_read_key_long(cg, "memory.stat", "slab "); - if (slab1 <= 0) + if (slab1 < 0) goto cleanup; current = cg_read_long(cg, "memory.current"); - if (current <= 0) + if (current < 0) goto cleanup; if (slab1 < slab0 / 2 && current < slab0 / 2) @@ -158,11 +162,11 @@ static int cg_run_in_subcgroups(const char *parent, * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS * threads. Then it checks the sanity of numbers on the parent level: * the total size of the cgroups should be roughly equal to - * anon + file + slab + kernel_stack. + * anon + file + kernel + sock. */ static int test_kmem_memcg_deletion(const char *root) { - long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum; + long current, anon, file, kernel, sock, sum; int ret = KSFT_FAIL; char *parent; @@ -180,29 +184,22 @@ static int test_kmem_memcg_deletion(const char *root) goto cleanup; current = cg_read_long(parent, "memory.current"); - slab = cg_read_key_long(parent, "memory.stat", "slab "); anon = cg_read_key_long(parent, "memory.stat", "anon "); file = cg_read_key_long(parent, "memory.stat", "file "); - kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack "); - pagetables = cg_read_key_long(parent, "memory.stat", "pagetables "); - percpu = cg_read_key_long(parent, "memory.stat", "percpu "); + kernel = cg_read_key_long(parent, "memory.stat", "kernel "); sock = cg_read_key_long(parent, "memory.stat", "sock "); - if (current < 0 || slab < 0 || anon < 0 || file < 0 || - kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0) + if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0) goto cleanup; - sum = slab + anon + file + kernel_stack + pagetables + percpu + sock; - if (abs(sum - current) < MAX_VMSTAT_ERROR) { + sum = anon + file + kernel + sock; + if (labs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { printf("memory.current = %ld\n", current); - printf("slab + anon + file + kernel_stack = %ld\n", sum); - printf("slab = %ld\n", slab); + printf("anon + file + kernel + sock = %ld\n", sum); printf("anon = %ld\n", anon); printf("file = %ld\n", file); - printf("kernel_stack = %ld\n", kernel_stack); - printf("pagetables = %ld\n", pagetables); - printf("percpu = %ld\n", percpu); + printf("kernel = %ld\n", kernel); printf("sock = %ld\n", sock); } @@ -311,6 +308,7 @@ static int test_kmem_dead_cgroups(const char *root) char *parent; long dead; int i; + int max_time = 20; parent = cg_name(root, "kmem_dead_cgroups_test"); if (!parent) @@ -325,7 +323,7 @@ static int test_kmem_dead_cgroups(const char *root) if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) goto cleanup; - for (i = 0; i < 5; i++) { + for (i = 0; i < max_time; i++) { dead = cg_read_key_long(parent, "cgroup.stat", "nr_dying_descendants "); if (dead == 0) { @@ -337,6 +335,8 @@ static int test_kmem_dead_cgroups(const char *root) * let's wait a bit and repeat. */ sleep(1); + if (i > 5) + printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead); } cleanup: @@ -383,7 +383,7 @@ static int test_percpu_basic(const char *root) current = cg_read_long(parent, "memory.current"); percpu = cg_read_key_long(parent, "memory.stat", "percpu "); - if (current > 0 && percpu > 0 && abs(current - percpu) < + if (current > 0 && percpu > 0 && labs(current - percpu) < MAX_VMSTAT_ERROR) ret = KSFT_PASS; else @@ -421,9 +421,11 @@ struct kmem_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; - if (cg_find_unified_root(root, sizeof(root))) + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* @@ -446,11 +448,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 1e616a8c6a9c..4e1647568c5b 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -18,12 +18,90 @@ #include <errno.h> #include <sys/mman.h> -#include "../kselftest.h" +#include "kselftest.h" #include "cgroup_util.h" static bool has_localevents; static bool has_recursiveprot; +int get_temp_fd(void) +{ + return open(".", O_TMPFILE | O_RDWR | O_EXCL); +} + +int alloc_pagecache(int fd, size_t size) +{ + char buf[PAGE_SIZE]; + struct stat st; + int i; + + if (fstat(fd, &st)) + goto cleanup; + + size += st.st_size; + + if (ftruncate(fd, size)) + goto cleanup; + + for (i = 0; i < size; i += sizeof(buf)) + read(fd, buf, sizeof(buf)); + + return 0; + +cleanup: + return -1; +} + +int alloc_anon(const char *cgroup, void *arg) +{ + size_t size = (unsigned long)arg; + char *buf, *ptr; + + buf = malloc(size); + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + *ptr = 0; + + free(buf); + return 0; +} + +int is_swap_enabled(void) +{ + char buf[PAGE_SIZE]; + const char delim[] = "\n"; + int cnt = 0; + char *line; + + if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) + return -1; + + for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) + cnt++; + + return cnt > 1; +} + +int set_oom_adj_score(int pid, int score) +{ + char path[PATH_MAX]; + int fd, len; + + sprintf(path, "/proc/%d/oom_score_adj", pid); + + fd = open(path, O_WRONLY | O_APPEND); + if (fd < 0) + return fd; + + len = dprintf(fd, "%d", score); + if (len < 0) { + close(fd); + return len; + } + + close(fd); + return 0; +} + /* * This test creates two nested cgroups with and without enabling * the memory controller. @@ -98,6 +176,11 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg) int ret = -1; buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return -1; + } + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; @@ -156,13 +239,16 @@ cleanup: /* * This test create a memory cgroup, allocates * some anonymous memory and some pagecache - * and check memory.current and some memory.stat values. + * and checks memory.current, memory.peak, and some memory.stat values. */ -static int test_memcg_current(const char *root) +static int test_memcg_current_peak(const char *root) { int ret = KSFT_FAIL; - long current; + long current, peak, peak_reset; char *memcg; + bool fd2_closed = false, fd3_closed = false, fd4_closed = false; + int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1; + struct stat ss; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -175,15 +261,124 @@ static int test_memcg_current(const char *root) if (current != 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak != 0) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* + * We'll open a few FDs for the same memory.peak file to exercise the free-path + * We need at least three to be closed in a different order than writes occurred to test + * the linked-list handling. + */ + peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.peak's fd, try to figure out whether + * this kernel supports writing to that file in the first place. (by + * checking the writable bit on the file's st_mode) + */ + if (fstat(peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd2 == -1) + goto cleanup; + + peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd3 == -1) + goto cleanup; + + /* any non-empty string resets, but make it clear */ + static const char reset_string[] = "reset\n"; + + peak_reset = write(peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd2, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd3, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + /* Make sure a completely independent read isn't affected by our FD-local reset above*/ + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + fd2_closed = true; + if (close(peak_fd2)) + goto cleanup; + + peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd4 == -1) + goto cleanup; + + peak_reset = write(peak_fd4, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd); + if (peak > MB(30) || peak < 0) + goto cleanup; + if (cg_run(memcg, alloc_pagecache_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* Make sure everything is back to normal */ + peak = cg_read_long_fd(peak_fd); + if (peak < MB(50)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd4); + if (peak < MB(50)) + goto cleanup; + + fd3_closed = true; + if (close(peak_fd3)) + goto cleanup; + + fd4_closed = true; + if (close(peak_fd4)) + goto cleanup; + ret = KSFT_PASS; cleanup: + close(peak_fd); + if (!fd2_closed) + close(peak_fd2); + if (!fd3_closed) + close(peak_fd3); + if (!fd4_closed) + close(peak_fd4); cg_destroy(memcg); free(memcg); @@ -211,6 +406,11 @@ static int alloc_anon_noexit(const char *cgroup, void *arg) char *buf, *ptr; buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return -1; + } + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; @@ -258,10 +458,11 @@ static bool reclaim_until(const char *memcg, long goal); * * Then it checks actual memory usages and expects that: * A/B memory.current ~= 50M - * A/B/C memory.current ~= 29M - * A/B/D memory.current ~= 21M - * A/B/E memory.current ~= 0 - * A/B/F memory.current = 0 + * A/B/C memory.current ~= 29M [memory.events:low > 0] + * A/B/D memory.current ~= 21M [memory.events:low > 0] + * A/B/E memory.current ~= 0 [memory.events:low == 0 if !memory_recursiveprot, + * undefined otherwise] + * A/B/F memory.current = 0 [memory.events:low == 0] * (for origin of the numbers, see model in memcg_protection.m.) * * After that it tries to allocate more than there is @@ -282,6 +483,7 @@ static int test_memcg_protection(const char *root, bool min) char *children[4] = {NULL}; const char *attribute = min ? "memory.min" : "memory.low"; long c[4]; + long current; int i, attempts; int fd; @@ -372,10 +574,10 @@ static int test_memcg_protection(const char *root, bool min) for (i = 0; i < ARRAY_SIZE(children); i++) c[i] = cg_read_long(children[i], "memory.current"); - if (!values_close(c[0], MB(29), 10)) + if (!values_close(c[0], MB(29), 15)) goto cleanup; - if (!values_close(c[1], MB(21), 10)) + if (!values_close(c[1], MB(21), 20)) goto cleanup; if (c[3] != 0) @@ -390,7 +592,8 @@ static int test_memcg_protection(const char *root, bool min) goto cleanup; } - if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) + current = min ? MB(50) : MB(30); + if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3)) goto cleanup; if (!reclaim_until(children[0], MB(10))) @@ -401,7 +604,14 @@ static int test_memcg_protection(const char *root, bool min) goto cleanup; } + /* + * Child 2 has memory.low=0, but some low protection may still be + * distributed down from its parent with memory.low=50M if cgroup2 + * memory_recursiveprot mount option is enabled. Ignore the low + * event count in this case. + */ for (i = 0; i < ARRAY_SIZE(children); i++) { + int ignore_low_events_index = has_recursiveprot ? 2 : -1; int no_low_events_index = 1; long low, oom; @@ -410,6 +620,8 @@ static int test_memcg_protection(const char *root, bool min) if (oom) goto cleanup; + if (i == ignore_low_events_index) + continue; if (i <= no_low_events_index && low <= 0) goto cleanup; if (i > no_low_events_index && low) @@ -704,7 +916,9 @@ static bool reclaim_until(const char *memcg, long goal) */ static int test_memcg_reclaim(const char *root) { - int ret = KSFT_FAIL, fd, retries; + int ret = KSFT_FAIL; + int fd = -1; + int retries; char *memcg; long current, expected_usage; @@ -778,6 +992,11 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) int ret = -1; buf = malloc(size); + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return -1; + } + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; @@ -798,13 +1017,19 @@ cleanup: /* * This test checks that memory.swap.max limits the amount of - * anonymous memory which can be swapped out. + * anonymous memory which can be swapped out. Additionally, it verifies that + * memory.swap.peak reflects the high watermark and can be reset. */ -static int test_memcg_swap_max(const char *root) +static int test_memcg_swap_max_peak(const char *root) { int ret = KSFT_FAIL; char *memcg; - long max; + long max, peak; + struct stat ss; + int swap_peak_fd = -1, mem_peak_fd = -1; + + /* any non-empty string resets */ + static const char reset_string[] = "foobarbaz"; if (!is_swap_enabled()) return KSFT_SKIP; @@ -821,6 +1046,61 @@ static int test_memcg_swap_max(const char *root) goto cleanup; } + swap_peak_fd = cg_open(memcg, "memory.swap.peak", + O_RDWR | O_APPEND | O_CLOEXEC); + + if (swap_peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.swap.peak's fd, try to figure out + * whether this kernel supports writing to that file in the first + * place. (by checking the writable bit on the file's st_mode) + */ + if (fstat(swap_peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (mem_peak_fd == -1) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak")) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + /* switch the swap and mem fds into local-peak tracking mode*/ + int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak")) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + if (cg_read_strcmp(memcg, "memory.max", "max\n")) goto cleanup; @@ -843,6 +1123,61 @@ static int test_memcg_swap_max(const char *root) if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(29)) + goto cleanup; + + /* + * open, reset and close the peak swap on another FD to make sure + * multiple extant fds don't corrupt the linked-list + */ + peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + /* actually reset on the fds */ + peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak > MB(10)) + goto cleanup; + + /* + * The cgroup is now empty, but there may be a page or two associated + * with the open FD accounted to it. + */ + peak = cg_read_long_fd(mem_peak_fd); + if (peak > MB(1)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak") < MB(29)) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak") < MB(29)) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) goto cleanup; @@ -850,9 +1185,29 @@ static int test_memcg_swap_max(const char *root) if (max <= 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(19)) + goto cleanup; + ret = KSFT_PASS; cleanup: + if (mem_peak_fd != -1 && close(mem_peak_fd)) + ret = KSFT_FAIL; + if (swap_peak_fd != -1 && close(swap_peak_fd)) + ret = KSFT_FAIL; cg_destroy(memcg); free(memcg); @@ -972,7 +1327,9 @@ static int tcp_client(const char *cgroup, unsigned short port) char servport[6]; int retries = 0x10; /* nice round number */ int sk, ret; + long allocated; + allocated = cg_read_long(cgroup, "memory.current"); snprintf(servport, sizeof(servport), "%hd", port); ret = getaddrinfo(server, servport, NULL, &ai); if (ret) @@ -1000,7 +1357,8 @@ static int tcp_client(const char *cgroup, unsigned short port) if (current < 0 || sock < 0) goto close_sk; - if (values_close(current, sock, 10)) { + /* exclude the memory not related to socket connection */ + if (values_close(current - allocated, sock, 10)) { ret = KSFT_PASS; break; } @@ -1273,7 +1631,7 @@ struct memcg_test { const char *name; } tests[] = { T(test_memcg_subtree_control), - T(test_memcg_current), + T(test_memcg_current_peak), T(test_memcg_min), T(test_memcg_low), T(test_memcg_high), @@ -1281,7 +1639,7 @@ struct memcg_test { T(test_memcg_max), T(test_memcg_reclaim), T(test_memcg_oom_events), - T(test_memcg_swap_max), + T(test_memcg_swap_max_peak), T(test_memcg_sock), T(test_memcg_oom_group_leaf_events), T(test_memcg_oom_group_parent_events), @@ -1292,9 +1650,11 @@ struct memcg_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, proc_status, ret = EXIT_SUCCESS; + int i, proc_status; - if (cg_find_unified_root(root, sizeof(root))) + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* @@ -1327,11 +1687,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c new file mode 100644 index 000000000000..9a387c815d2c --- /dev/null +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <errno.h> +#include <linux/limits.h> +#include <signal.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "kselftest.h" +#include "cgroup_util.h" + +static int run_success(const char *cgroup, void *arg) +{ + return 0; +} + +static int run_pause(const char *cgroup, void *arg) +{ + return pause(); +} + +/* + * This test checks that pids.max prevents forking new children above the + * specified limit in the cgroup. + */ +static int test_pids_max(const char *root) +{ + int ret = KSFT_FAIL; + char *cg_pids; + int pid; + + cg_pids = cg_name(root, "pids_test"); + if (!cg_pids) + goto cleanup; + + if (cg_create(cg_pids)) + goto cleanup; + + if (cg_read_strcmp(cg_pids, "pids.max", "max\n")) + goto cleanup; + + if (cg_write(cg_pids, "pids.max", "2")) + goto cleanup; + + if (cg_enter_current(cg_pids)) + goto cleanup; + + pid = cg_run_nowait(cg_pids, run_pause, NULL); + if (pid < 0) + goto cleanup; + + if (cg_run_nowait(cg_pids, run_success, NULL) != -1 || errno != EAGAIN) + goto cleanup; + + if (kill(pid, SIGINT)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + cg_destroy(cg_pids); + free(cg_pids); + + return ret; +} + +/* + * This test checks that pids.events are counted in cgroup associated with pids.max + */ +static int test_pids_events(const char *root) +{ + int ret = KSFT_FAIL; + char *cg_parent = NULL, *cg_child = NULL; + int pid; + + if (cgroup_feature("pids_localevents") <= 0) + return KSFT_SKIP; + + cg_parent = cg_name(root, "pids_parent"); + cg_child = cg_name(cg_parent, "pids_child"); + if (!cg_parent || !cg_child) + goto cleanup; + + if (cg_create(cg_parent)) + goto cleanup; + if (cg_write(cg_parent, "cgroup.subtree_control", "+pids")) + goto cleanup; + if (cg_create(cg_child)) + goto cleanup; + + if (cg_write(cg_parent, "pids.max", "2")) + goto cleanup; + + if (cg_read_strcmp(cg_child, "pids.max", "max\n")) + goto cleanup; + + if (cg_enter_current(cg_child)) + goto cleanup; + + pid = cg_run_nowait(cg_child, run_pause, NULL); + if (pid < 0) + goto cleanup; + + if (cg_run_nowait(cg_child, run_success, NULL) != -1 || errno != EAGAIN) + goto cleanup; + + if (kill(pid, SIGINT)) + goto cleanup; + + if (cg_read_key_long(cg_child, "pids.events", "max ") != 0) + goto cleanup; + if (cg_read_key_long(cg_parent, "pids.events", "max ") != 1) + goto cleanup; + + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (cg_child) + cg_destroy(cg_child); + if (cg_parent) + cg_destroy(cg_parent); + free(cg_child); + free(cg_parent); + + return ret; +} + + + +#define T(x) { x, #x } +struct pids_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_pids_max), + T(test_pids_events), +}; +#undef T + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + /* + * Check that pids controller is available: + * pids is listed in cgroup.controllers + */ + if (cg_read_strstr(root, "cgroup.controllers", "pids")) + ksft_exit_skip("pids controller isn't available\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "pids")) + if (cg_write(root, "cgroup.subtree_control", "+pids")) + ksft_exit_skip("Failed to set pids controller\n"); + + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + ksft_finished(); +} diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c new file mode 100644 index 000000000000..64ebc3f3f203 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <linux/limits.h> +#include <unistd.h> +#include <stdio.h> +#include <signal.h> +#include <sys/sysinfo.h> +#include <string.h> +#include <sys/wait.h> +#include <sys/mman.h> + +#include "kselftest.h" +#include "cgroup_util.h" + +static int read_int(const char *path, size_t *value) +{ + FILE *file; + int ret = 0; + + file = fopen(path, "r"); + if (!file) + return -1; + if (fscanf(file, "%ld", value) != 1) + ret = -1; + fclose(file); + return ret; +} + +static int set_min_free_kb(size_t value) +{ + FILE *file; + int ret; + + file = fopen("/proc/sys/vm/min_free_kbytes", "w"); + if (!file) + return -1; + ret = fprintf(file, "%ld\n", value); + fclose(file); + return ret; +} + +static int read_min_free_kb(size_t *value) +{ + return read_int("/proc/sys/vm/min_free_kbytes", value); +} + +static int get_zswap_stored_pages(size_t *value) +{ + return read_int("/sys/kernel/debug/zswap/stored_pages", value); +} + +static long get_cg_wb_count(const char *cg) +{ + return cg_read_key_long(cg, "memory.stat", "zswpwb"); +} + +static long get_zswpout(const char *cgroup) +{ + return cg_read_key_long(cgroup, "memory.stat", "zswpout "); +} + +static int allocate_and_read_bytes(const char *cgroup, void *arg) +{ + size_t size = (size_t)arg; + char *mem = (char *)malloc(size); + int ret = 0; + + if (!mem) + return -1; + for (int i = 0; i < size; i += 4095) + mem[i] = 'a'; + + /* Go through the allocated memory to (z)swap in and out pages */ + for (int i = 0; i < size; i += 4095) { + if (mem[i] != 'a') + ret = -1; + } + + free(mem); + return ret; +} + +static int allocate_bytes(const char *cgroup, void *arg) +{ + size_t size = (size_t)arg; + char *mem = (char *)malloc(size); + + if (!mem) + return -1; + for (int i = 0; i < size; i += 4095) + mem[i] = 'a'; + free(mem); + return 0; +} + +static char *setup_test_group_1M(const char *root, const char *name) +{ + char *group_name = cg_name(root, name); + + if (!group_name) + return NULL; + if (cg_create(group_name)) + goto fail; + if (cg_write(group_name, "memory.max", "1M")) { + cg_destroy(group_name); + goto fail; + } + return group_name; +fail: + free(group_name); + return NULL; +} + +/* + * Sanity test to check that pages are written into zswap. + */ +static int test_zswap_usage(const char *root) +{ + long zswpout_before, zswpout_after; + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "no_shrink_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "1M")) + goto out; + + zswpout_before = get_zswpout(test_group); + if (zswpout_before < 0) { + ksft_print_msg("Failed to get zswpout\n"); + goto out; + } + + /* Allocate more than memory.max to push memory into zswap */ + if (cg_run(test_group, allocate_bytes, (void *)MB(4))) + goto out; + + /* Verify that pages come into zswap */ + zswpout_after = get_zswpout(test_group); + if (zswpout_after <= zswpout_before) { + ksft_print_msg("zswpout does not increase after test program\n"); + goto out; + } + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +/* + * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for + * the cgroup. + */ +static int test_swapin_nozswap(const char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + long swap_peak, zswpout; + + test_group = cg_name(root, "no_zswap_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "8M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "0")) + goto out; + + /* Allocate and read more than memory.max to trigger swapin */ + if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + goto out; + + /* Verify that pages are swapped out, but no zswap happened */ + swap_peak = cg_read_long(test_group, "memory.swap.peak"); + if (swap_peak < 0) { + ksft_print_msg("failed to get cgroup's swap_peak\n"); + goto out; + } + + if (swap_peak < MB(24)) { + ksft_print_msg("at least 24MB of memory should be swapped out\n"); + goto out; + } + + zswpout = get_zswpout(test_group); + if (zswpout < 0) { + ksft_print_msg("failed to get zswpout\n"); + goto out; + } + + if (zswpout > 0) { + ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +/* Simple test to verify the (z)swapin code paths */ +static int test_zswapin(const char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + long zswpin; + + test_group = cg_name(root, "zswapin_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "8M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + + /* Allocate and read more than memory.max to trigger (z)swap in */ + if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + goto out; + + zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin "); + if (zswpin < 0) { + ksft_print_msg("failed to get zswpin\n"); + goto out; + } + + if (zswpin < MB(24) / PAGE_SIZE) { + ksft_print_msg("at least 24MB should be brought back from zswap\n"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +/* + * Attempt writeback with the following steps: + * 1. Allocate memory. + * 2. Reclaim memory equal to the amount that was allocated in step 1. + This will move it into zswap. + * 3. Save current zswap usage. + * 4. Move the memory allocated in step 1 back in from zswap. + * 5. Set zswap.max to half the amount that was recorded in step 3. + * 6. Attempt to reclaim memory equal to the amount that was allocated, + this will either trigger writeback if it's enabled, or reclamation + will fail if writeback is disabled as there isn't enough zswap space. + */ +static int attempt_writeback(const char *cgroup, void *arg) +{ + long pagesize = sysconf(_SC_PAGESIZE); + size_t memsize = MB(4); + char buf[pagesize]; + long zswap_usage; + bool wb_enabled = *(bool *) arg; + int ret = -1; + char *mem; + + mem = (char *)malloc(memsize); + if (!mem) + return ret; + + /* + * Fill half of each page with increasing data, and keep other + * half empty, this will result in data that is still compressible + * and ends up in zswap, with material zswap usage. + */ + for (int i = 0; i < pagesize; i++) + buf[i] = i < pagesize/2 ? (char) i : 0; + + for (int i = 0; i < memsize; i += pagesize) + memcpy(&mem[i], buf, pagesize); + + /* Try and reclaim allocated memory */ + if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { + ksft_print_msg("Failed to reclaim all of the requested memory\n"); + goto out; + } + + zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); + + /* zswpin */ + for (int i = 0; i < memsize; i += pagesize) { + if (memcmp(&mem[i], buf, pagesize)) { + ksft_print_msg("invalid memory\n"); + goto out; + } + } + + if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) + goto out; + + /* + * If writeback is enabled, trying to reclaim memory now will trigger a + * writeback as zswap.max is half of what was needed when reclaim ran the first time. + * If writeback is disabled, memory reclaim will fail as zswap is limited and + * it can't writeback to swap. + */ + ret = cg_write_numeric(cgroup, "memory.reclaim", memsize); + if (!wb_enabled) + ret = (ret == -EAGAIN) ? 0 : -1; + +out: + free(mem); + return ret; +} + +static int test_zswap_writeback_one(const char *cgroup, bool wb) +{ + long zswpwb_before, zswpwb_after; + + zswpwb_before = get_cg_wb_count(cgroup); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + return -1; + } + + if (cg_run(cgroup, attempt_writeback, (void *) &wb)) + return -1; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(cgroup); + if (zswpwb_after < 0) + return -1; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s\n", + zswpwb_after, wb ? "enabled" : "disabled"); + return -1; + } + + return 0; +} + +/* Test to verify the zswap writeback path */ +static int test_zswap_writeback(const char *root, bool wb) +{ + int ret = KSFT_FAIL; + char *test_group, *test_group_child = NULL; + + if (cg_read_strcmp(root, "memory.zswap.writeback", "1")) + return KSFT_SKIP; + + test_group = cg_name(root, "zswap_writeback_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) + goto out; + + if (test_zswap_writeback_one(test_group, wb)) + goto out; + + /* Reset memory.zswap.max to max (modified by attempt_writeback), and + * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1. + * Thus, the parent's setting shall be what's in effect. */ + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + if (cg_write(test_group, "cgroup.subtree_control", "+memory")) + goto out; + + test_group_child = cg_name(test_group, "zswap_writeback_test_child"); + if (!test_group_child) + goto out; + if (cg_create(test_group_child)) + goto out; + if (cg_write(test_group_child, "memory.zswap.writeback", "1")) + goto out; + + if (test_zswap_writeback_one(test_group_child, wb)) + goto out; + + ret = KSFT_PASS; + +out: + if (test_group_child) { + cg_destroy(test_group_child); + free(test_group_child); + } + cg_destroy(test_group); + free(test_group); + return ret; +} + +static int test_zswap_writeback_enabled(const char *root) +{ + return test_zswap_writeback(root, true); +} + +static int test_zswap_writeback_disabled(const char *root) +{ + return test_zswap_writeback(root, false); +} + +/* + * When trying to store a memcg page in zswap, if the memcg hits its memory + * limit in zswap, writeback should affect only the zswapped pages of that + * memcg. + */ +static int test_no_invasive_cgroup_shrink(const char *root) +{ + int ret = KSFT_FAIL; + size_t control_allocation_size = MB(10); + char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; + + wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); + if (!wb_group) + return KSFT_FAIL; + if (cg_write(wb_group, "memory.zswap.max", "10K")) + goto out; + control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!control_group) + goto out; + + /* Push some test_group2 memory into zswap */ + if (cg_enter_current(control_group)) + goto out; + control_allocation = malloc(control_allocation_size); + for (int i = 0; i < control_allocation_size; i += 4095) + control_allocation[i] = 'a'; + if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) + goto out; + + /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ + if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) + goto out; + + /* Verify that only zswapped memory from gwb_group has been written back */ + if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) + ret = KSFT_PASS; +out: + cg_enter_current(root); + if (control_group) { + cg_destroy(control_group); + free(control_group); + } + cg_destroy(wb_group); + free(wb_group); + if (control_allocation) + free(control_allocation); + return ret; +} + +struct no_kmem_bypass_child_args { + size_t target_alloc_bytes; + size_t child_allocated; +}; + +static int no_kmem_bypass_child(const char *cgroup, void *arg) +{ + struct no_kmem_bypass_child_args *values = arg; + void *allocation; + + allocation = malloc(values->target_alloc_bytes); + if (!allocation) { + values->child_allocated = true; + return -1; + } + for (long i = 0; i < values->target_alloc_bytes; i += 4095) + ((char *)allocation)[i] = 'a'; + values->child_allocated = true; + pause(); + free(allocation); + return 0; +} + +/* + * When pages owned by a memcg are pushed to zswap by kswapd, they should be + * charged to that cgroup. This wasn't the case before commit + * cd08d80ecdac("mm: correctly charge compressed memory to its memcg"). + * + * The test first allocates memory in a memcg, then raises min_free_kbytes to + * a very high value so that the allocation falls below low wm, then makes + * another allocation to trigger kswapd that should push the memcg-owned pages + * to zswap and verifies that the zswap pages are correctly charged. + * + * To be run on a VM with at most 4G of memory. + */ +static int test_no_kmem_bypass(const char *root) +{ + size_t min_free_kb_high, min_free_kb_low, min_free_kb_original; + struct no_kmem_bypass_child_args *values; + size_t trigger_allocation_size; + int wait_child_iteration = 0; + long stored_pages_threshold; + struct sysinfo sys_info; + int ret = KSFT_FAIL; + int child_status; + char *test_group = NULL; + pid_t child_pid; + + /* Read sys info and compute test values accordingly */ + if (sysinfo(&sys_info) != 0) + return KSFT_FAIL; + if (sys_info.totalram > 5000000000) + return KSFT_SKIP; + values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ | + PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (values == MAP_FAILED) + return KSFT_FAIL; + if (read_min_free_kb(&min_free_kb_original)) + return KSFT_FAIL; + min_free_kb_high = sys_info.totalram / 2000; + min_free_kb_low = sys_info.totalram / 500000; + values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) + + sys_info.totalram * 5 / 100; + stored_pages_threshold = sys_info.totalram / 5 / 4096; + trigger_allocation_size = sys_info.totalram / 20; + + /* Set up test memcg */ + test_group = cg_name(root, "kmem_bypass_test"); + if (!test_group) + goto out; + + /* Spawn memcg child and wait for it to allocate */ + set_min_free_kb(min_free_kb_low); + if (cg_create(test_group)) + goto out; + values->child_allocated = false; + child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values); + if (child_pid < 0) + goto out; + while (!values->child_allocated && wait_child_iteration++ < 10000) + usleep(1000); + + /* Try to wakeup kswapd and let it push child memory to zswap */ + set_min_free_kb(min_free_kb_high); + for (int i = 0; i < 20; i++) { + size_t stored_pages; + char *trigger_allocation = malloc(trigger_allocation_size); + + if (!trigger_allocation) + break; + for (int i = 0; i < trigger_allocation_size; i += 4095) + trigger_allocation[i] = 'b'; + usleep(100000); + free(trigger_allocation); + if (get_zswap_stored_pages(&stored_pages)) + break; + if (stored_pages < 0) + break; + /* If memory was pushed to zswap, verify it belongs to memcg */ + if (stored_pages > stored_pages_threshold) { + int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); + int delta = stored_pages * 4096 - zswapped; + int result_ok = delta < stored_pages * 4096 / 4; + + ret = result_ok ? KSFT_PASS : KSFT_FAIL; + break; + } + } + + kill(child_pid, SIGTERM); + waitpid(child_pid, &child_status, 0); +out: + set_min_free_kb(min_free_kb_original); + cg_destroy(test_group); + free(test_group); + return ret; +} + +#define T(x) { x, #x } +struct zswap_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_zswap_usage), + T(test_swapin_nozswap), + T(test_zswapin), + T(test_zswap_writeback_enabled), + T(test_zswap_writeback_disabled), + T(test_no_kmem_bypass), + T(test_no_invasive_cgroup_shrink), +}; +#undef T + +static bool zswap_configured(void) +{ + return access("/sys/module/zswap", F_OK) == 0; +} + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + if (!zswap_configured()) + ksft_exit_skip("zswap isn't configured\n"); + + /* + * Check that memory controller is available: + * memory is listed in cgroup.controllers + */ + if (cg_read_strstr(root, "cgroup.controllers", "memory")) + ksft_exit_skip("memory controller isn't available\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) + if (cg_write(root, "cgroup.subtree_control", "+memory")) + ksft_exit_skip("Failed to set memory controller\n"); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + ksft_finished(); +} |
